paasta-tools 1.21.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348)
  1. k8s_itests/__init__.py +0 -0
  2. k8s_itests/test_autoscaling.py +23 -0
  3. k8s_itests/utils.py +38 -0
  4. paasta_tools/__init__.py +20 -0
  5. paasta_tools/adhoc_tools.py +142 -0
  6. paasta_tools/api/__init__.py +13 -0
  7. paasta_tools/api/api.py +330 -0
  8. paasta_tools/api/api_docs/swagger.json +2323 -0
  9. paasta_tools/api/client.py +106 -0
  10. paasta_tools/api/settings.py +33 -0
  11. paasta_tools/api/tweens/__init__.py +6 -0
  12. paasta_tools/api/tweens/auth.py +125 -0
  13. paasta_tools/api/tweens/profiling.py +108 -0
  14. paasta_tools/api/tweens/request_logger.py +124 -0
  15. paasta_tools/api/views/__init__.py +13 -0
  16. paasta_tools/api/views/autoscaler.py +100 -0
  17. paasta_tools/api/views/exception.py +45 -0
  18. paasta_tools/api/views/flink.py +73 -0
  19. paasta_tools/api/views/instance.py +395 -0
  20. paasta_tools/api/views/pause_autoscaler.py +71 -0
  21. paasta_tools/api/views/remote_run.py +113 -0
  22. paasta_tools/api/views/resources.py +76 -0
  23. paasta_tools/api/views/service.py +35 -0
  24. paasta_tools/api/views/version.py +25 -0
  25. paasta_tools/apply_external_resources.py +79 -0
  26. paasta_tools/async_utils.py +109 -0
  27. paasta_tools/autoscaling/__init__.py +0 -0
  28. paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
  29. paasta_tools/autoscaling/forecasting.py +106 -0
  30. paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
  31. paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
  32. paasta_tools/autoscaling/utils.py +52 -0
  33. paasta_tools/bounce_lib.py +184 -0
  34. paasta_tools/broadcast_log_to_services.py +62 -0
  35. paasta_tools/cassandracluster_tools.py +210 -0
  36. paasta_tools/check_autoscaler_max_instances.py +212 -0
  37. paasta_tools/check_cassandracluster_services_replication.py +35 -0
  38. paasta_tools/check_flink_services_health.py +203 -0
  39. paasta_tools/check_kubernetes_api.py +57 -0
  40. paasta_tools/check_kubernetes_services_replication.py +141 -0
  41. paasta_tools/check_oom_events.py +244 -0
  42. paasta_tools/check_services_replication_tools.py +324 -0
  43. paasta_tools/check_spark_jobs.py +234 -0
  44. paasta_tools/cleanup_kubernetes_cr.py +138 -0
  45. paasta_tools/cleanup_kubernetes_crd.py +145 -0
  46. paasta_tools/cleanup_kubernetes_jobs.py +344 -0
  47. paasta_tools/cleanup_tron_namespaces.py +96 -0
  48. paasta_tools/cli/__init__.py +13 -0
  49. paasta_tools/cli/authentication.py +85 -0
  50. paasta_tools/cli/cli.py +260 -0
  51. paasta_tools/cli/cmds/__init__.py +13 -0
  52. paasta_tools/cli/cmds/autoscale.py +143 -0
  53. paasta_tools/cli/cmds/check.py +334 -0
  54. paasta_tools/cli/cmds/cook_image.py +147 -0
  55. paasta_tools/cli/cmds/get_docker_image.py +76 -0
  56. paasta_tools/cli/cmds/get_image_version.py +172 -0
  57. paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
  58. paasta_tools/cli/cmds/info.py +155 -0
  59. paasta_tools/cli/cmds/itest.py +117 -0
  60. paasta_tools/cli/cmds/list.py +66 -0
  61. paasta_tools/cli/cmds/list_clusters.py +42 -0
  62. paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
  63. paasta_tools/cli/cmds/list_namespaces.py +84 -0
  64. paasta_tools/cli/cmds/local_run.py +1396 -0
  65. paasta_tools/cli/cmds/logs.py +1601 -0
  66. paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
  67. paasta_tools/cli/cmds/mesh_status.py +174 -0
  68. paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
  69. paasta_tools/cli/cmds/push_to_registry.py +275 -0
  70. paasta_tools/cli/cmds/remote_run.py +252 -0
  71. paasta_tools/cli/cmds/rollback.py +347 -0
  72. paasta_tools/cli/cmds/secret.py +549 -0
  73. paasta_tools/cli/cmds/security_check.py +59 -0
  74. paasta_tools/cli/cmds/spark_run.py +1400 -0
  75. paasta_tools/cli/cmds/start_stop_restart.py +401 -0
  76. paasta_tools/cli/cmds/status.py +2302 -0
  77. paasta_tools/cli/cmds/validate.py +1012 -0
  78. paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
  79. paasta_tools/cli/fsm/__init__.py +13 -0
  80. paasta_tools/cli/fsm/autosuggest.py +82 -0
  81. paasta_tools/cli/fsm/template/README.md +8 -0
  82. paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
  83. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
  84. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
  85. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
  86. paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
  87. paasta_tools/cli/fsm_cmd.py +121 -0
  88. paasta_tools/cli/paasta_tabcomplete.sh +23 -0
  89. paasta_tools/cli/schemas/adhoc_schema.json +199 -0
  90. paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
  91. paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
  92. paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
  93. paasta_tools/cli/schemas/deploy_schema.json +173 -0
  94. paasta_tools/cli/schemas/eks_schema.json +970 -0
  95. paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
  96. paasta_tools/cli/schemas/rollback_schema.json +160 -0
  97. paasta_tools/cli/schemas/service_schema.json +25 -0
  98. paasta_tools/cli/schemas/smartstack_schema.json +322 -0
  99. paasta_tools/cli/schemas/tron_schema.json +699 -0
  100. paasta_tools/cli/utils.py +1118 -0
  101. paasta_tools/clusterman.py +21 -0
  102. paasta_tools/config_utils.py +385 -0
  103. paasta_tools/contrib/__init__.py +0 -0
  104. paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
  105. paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
  106. paasta_tools/contrib/check_orphans.py +306 -0
  107. paasta_tools/contrib/create_dynamodb_table.py +35 -0
  108. paasta_tools/contrib/create_paasta_playground.py +105 -0
  109. paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
  110. paasta_tools/contrib/get_running_task_allocation.py +346 -0
  111. paasta_tools/contrib/habitat_fixer.py +86 -0
  112. paasta_tools/contrib/ide_helper.py +316 -0
  113. paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
  114. paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
  115. paasta_tools/contrib/kill_bad_containers.py +109 -0
  116. paasta_tools/contrib/mass-deploy-tag.sh +44 -0
  117. paasta_tools/contrib/mock_patch_checker.py +86 -0
  118. paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
  119. paasta_tools/contrib/render_template.py +129 -0
  120. paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
  121. paasta_tools/contrib/service_shard_remove.py +157 -0
  122. paasta_tools/contrib/service_shard_update.py +373 -0
  123. paasta_tools/contrib/shared_ip_check.py +77 -0
  124. paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
  125. paasta_tools/delete_kubernetes_deployments.py +89 -0
  126. paasta_tools/deployment_utils.py +44 -0
  127. paasta_tools/docker_wrapper.py +234 -0
  128. paasta_tools/docker_wrapper_imports.py +13 -0
  129. paasta_tools/drain_lib.py +351 -0
  130. paasta_tools/dump_locally_running_services.py +71 -0
  131. paasta_tools/eks_tools.py +119 -0
  132. paasta_tools/envoy_tools.py +373 -0
  133. paasta_tools/firewall.py +504 -0
  134. paasta_tools/firewall_logging.py +154 -0
  135. paasta_tools/firewall_update.py +172 -0
  136. paasta_tools/flink_tools.py +345 -0
  137. paasta_tools/flinkeks_tools.py +90 -0
  138. paasta_tools/frameworks/__init__.py +0 -0
  139. paasta_tools/frameworks/adhoc_scheduler.py +71 -0
  140. paasta_tools/frameworks/constraints.py +87 -0
  141. paasta_tools/frameworks/native_scheduler.py +652 -0
  142. paasta_tools/frameworks/native_service_config.py +301 -0
  143. paasta_tools/frameworks/task_store.py +245 -0
  144. paasta_tools/generate_all_deployments +9 -0
  145. paasta_tools/generate_authenticating_services.py +94 -0
  146. paasta_tools/generate_deployments_for_service.py +255 -0
  147. paasta_tools/generate_services_file.py +114 -0
  148. paasta_tools/generate_services_yaml.py +30 -0
  149. paasta_tools/hacheck.py +76 -0
  150. paasta_tools/instance/__init__.py +0 -0
  151. paasta_tools/instance/hpa_metrics_parser.py +122 -0
  152. paasta_tools/instance/kubernetes.py +1362 -0
  153. paasta_tools/iptables.py +240 -0
  154. paasta_tools/kafkacluster_tools.py +143 -0
  155. paasta_tools/kubernetes/__init__.py +0 -0
  156. paasta_tools/kubernetes/application/__init__.py +0 -0
  157. paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
  158. paasta_tools/kubernetes/application/tools.py +90 -0
  159. paasta_tools/kubernetes/bin/__init__.py +0 -0
  160. paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
  161. paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
  162. paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
  163. paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
  164. paasta_tools/kubernetes/remote_run.py +558 -0
  165. paasta_tools/kubernetes_tools.py +4679 -0
  166. paasta_tools/list_kubernetes_service_instances.py +128 -0
  167. paasta_tools/list_tron_namespaces.py +60 -0
  168. paasta_tools/long_running_service_tools.py +678 -0
  169. paasta_tools/mac_address.py +44 -0
  170. paasta_tools/marathon_dashboard.py +0 -0
  171. paasta_tools/mesos/__init__.py +0 -0
  172. paasta_tools/mesos/cfg.py +46 -0
  173. paasta_tools/mesos/cluster.py +60 -0
  174. paasta_tools/mesos/exceptions.py +59 -0
  175. paasta_tools/mesos/framework.py +77 -0
  176. paasta_tools/mesos/log.py +48 -0
  177. paasta_tools/mesos/master.py +306 -0
  178. paasta_tools/mesos/mesos_file.py +169 -0
  179. paasta_tools/mesos/parallel.py +52 -0
  180. paasta_tools/mesos/slave.py +115 -0
  181. paasta_tools/mesos/task.py +94 -0
  182. paasta_tools/mesos/util.py +69 -0
  183. paasta_tools/mesos/zookeeper.py +37 -0
  184. paasta_tools/mesos_maintenance.py +848 -0
  185. paasta_tools/mesos_tools.py +1051 -0
  186. paasta_tools/metrics/__init__.py +0 -0
  187. paasta_tools/metrics/metastatus_lib.py +1110 -0
  188. paasta_tools/metrics/metrics_lib.py +217 -0
  189. paasta_tools/monitoring/__init__.py +13 -0
  190. paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
  191. paasta_tools/monitoring_tools.py +652 -0
  192. paasta_tools/monkrelaycluster_tools.py +146 -0
  193. paasta_tools/nrtsearchservice_tools.py +143 -0
  194. paasta_tools/nrtsearchserviceeks_tools.py +68 -0
  195. paasta_tools/oom_logger.py +321 -0
  196. paasta_tools/paasta_deploy_tron_jobs +3 -0
  197. paasta_tools/paasta_execute_docker_command.py +123 -0
  198. paasta_tools/paasta_native_serviceinit.py +21 -0
  199. paasta_tools/paasta_service_config_loader.py +201 -0
  200. paasta_tools/paastaapi/__init__.py +29 -0
  201. paasta_tools/paastaapi/api/__init__.py +3 -0
  202. paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
  203. paasta_tools/paastaapi/api/default_api.py +569 -0
  204. paasta_tools/paastaapi/api/remote_run_api.py +604 -0
  205. paasta_tools/paastaapi/api/resources_api.py +157 -0
  206. paasta_tools/paastaapi/api/service_api.py +1736 -0
  207. paasta_tools/paastaapi/api_client.py +818 -0
  208. paasta_tools/paastaapi/apis/__init__.py +22 -0
  209. paasta_tools/paastaapi/configuration.py +455 -0
  210. paasta_tools/paastaapi/exceptions.py +137 -0
  211. paasta_tools/paastaapi/model/__init__.py +5 -0
  212. paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
  213. paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
  214. paasta_tools/paastaapi/model/deploy_queue.py +178 -0
  215. paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
  216. paasta_tools/paastaapi/model/envoy_backend.py +185 -0
  217. paasta_tools/paastaapi/model/envoy_location.py +184 -0
  218. paasta_tools/paastaapi/model/envoy_status.py +181 -0
  219. paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
  220. paasta_tools/paastaapi/model/flink_config.py +173 -0
  221. paasta_tools/paastaapi/model/flink_job.py +186 -0
  222. paasta_tools/paastaapi/model/flink_job_details.py +192 -0
  223. paasta_tools/paastaapi/model/flink_jobs.py +175 -0
  224. paasta_tools/paastaapi/model/float_and_error.py +173 -0
  225. paasta_tools/paastaapi/model/hpa_metric.py +176 -0
  226. paasta_tools/paastaapi/model/inline_object.py +170 -0
  227. paasta_tools/paastaapi/model/inline_response200.py +170 -0
  228. paasta_tools/paastaapi/model/inline_response2001.py +170 -0
  229. paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
  230. paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
  231. paasta_tools/paastaapi/model/instance_status.py +220 -0
  232. paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
  233. paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
  234. paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
  235. paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
  236. paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
  237. paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
  238. paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
  239. paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
  240. paasta_tools/paastaapi/model/instance_tasks.py +182 -0
  241. paasta_tools/paastaapi/model/integer_and_error.py +173 -0
  242. paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
  243. paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
  244. paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
  245. paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
  246. paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
  247. paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
  248. paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
  249. paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
  250. paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
  251. paasta_tools/paastaapi/model/remote_run_start.py +185 -0
  252. paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
  253. paasta_tools/paastaapi/model/remote_run_token.py +173 -0
  254. paasta_tools/paastaapi/model/resource.py +187 -0
  255. paasta_tools/paastaapi/model/resource_item.py +187 -0
  256. paasta_tools/paastaapi/model/resource_value.py +176 -0
  257. paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
  258. paasta_tools/paastaapi/model/smartstack_location.py +181 -0
  259. paasta_tools/paastaapi/model/smartstack_status.py +181 -0
  260. paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
  261. paasta_tools/paastaapi/model_utils.py +1879 -0
  262. paasta_tools/paastaapi/models/__init__.py +62 -0
  263. paasta_tools/paastaapi/rest.py +287 -0
  264. paasta_tools/prune_completed_pods.py +220 -0
  265. paasta_tools/puppet_service_tools.py +59 -0
  266. paasta_tools/py.typed +1 -0
  267. paasta_tools/remote_git.py +127 -0
  268. paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
  269. paasta_tools/run-paasta-api-playground.py +51 -0
  270. paasta_tools/secret_providers/__init__.py +66 -0
  271. paasta_tools/secret_providers/vault.py +214 -0
  272. paasta_tools/secret_tools.py +277 -0
  273. paasta_tools/setup_istio_mesh.py +353 -0
  274. paasta_tools/setup_kubernetes_cr.py +412 -0
  275. paasta_tools/setup_kubernetes_crd.py +138 -0
  276. paasta_tools/setup_kubernetes_internal_crd.py +154 -0
  277. paasta_tools/setup_kubernetes_job.py +353 -0
  278. paasta_tools/setup_prometheus_adapter_config.py +1028 -0
  279. paasta_tools/setup_tron_namespace.py +248 -0
  280. paasta_tools/slack.py +75 -0
  281. paasta_tools/smartstack_tools.py +676 -0
  282. paasta_tools/spark_tools.py +283 -0
  283. paasta_tools/synapse_srv_namespaces_fact.py +42 -0
  284. paasta_tools/tron/__init__.py +0 -0
  285. paasta_tools/tron/client.py +158 -0
  286. paasta_tools/tron/tron_command_context.py +194 -0
  287. paasta_tools/tron/tron_timeutils.py +101 -0
  288. paasta_tools/tron_tools.py +1448 -0
  289. paasta_tools/utils.py +4307 -0
  290. paasta_tools/yaml_tools.py +44 -0
  291. paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
  292. paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
  293. paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
  294. paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
  295. paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
  296. paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
  297. paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
  298. paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
  299. paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
  300. paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
  301. paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
  302. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
  303. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
  304. paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
  305. paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
  306. paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
  307. paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
  308. paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
  309. paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
  310. paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
  311. paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
  312. paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
  313. paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
  314. paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
  315. paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
  316. paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
  317. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
  318. paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
  319. paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
  320. paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
  321. paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
  322. paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
  323. paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
  324. paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
  325. paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
  326. paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
  327. paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
  328. paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
  329. paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
  330. paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
  331. paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
  332. paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
  333. paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
  334. paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
  335. paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
  336. paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
  337. paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
  338. paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
  339. paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
  340. paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
  341. paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
  342. paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
  343. paasta_tools-1.21.3.dist-info/LICENSE +201 -0
  344. paasta_tools-1.21.3.dist-info/METADATA +74 -0
  345. paasta_tools-1.21.3.dist-info/RECORD +348 -0
  346. paasta_tools-1.21.3.dist-info/WHEEL +5 -0
  347. paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
  348. paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2015-2019 Yelp Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Usage: ./check_cassandracluster_services_replication.py [options]
17
+ """
18
+ import logging
19
+
20
+ from paasta_tools import cassandracluster_tools
21
+ from paasta_tools.check_kubernetes_services_replication import (
22
+ check_kubernetes_pod_replication,
23
+ )
24
+ from paasta_tools.check_services_replication_tools import main
25
+
26
+
27
+ log = logging.getLogger(__name__)
28
+
29
+
30
if __name__ == "__main__":
    # Hand the Cassandra cluster deployment config class and the Kubernetes pod
    # replication check to the shared entry point, scoped to the
    # "paasta-cassandraclusters" namespace.
    deployment_config_class = cassandracluster_tools.CassandraClusterDeploymentConfig
    main(
        deployment_config_class,
        check_kubernetes_pod_replication,
        namespace="paasta-cassandraclusters",
    )
@@ -0,0 +1,203 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2015-2019 Yelp Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Usage: ./check_flink_services_health.py [options]
17
+ """
18
+ import datetime
19
+ import logging
20
+ from typing import Dict
21
+ from typing import List
22
+ from typing import Sequence
23
+ from typing import Tuple
24
+
25
+ import pysensu_yelp
26
+
27
+ from paasta_tools import flink_tools
28
+ from paasta_tools import flinkeks_tools
29
+ from paasta_tools.check_services_replication_tools import main
30
+ from paasta_tools.check_services_replication_tools import parse_args
31
+ from paasta_tools.flink_tools import FlinkDeploymentConfig
32
+ from paasta_tools.kubernetes_tools import is_pod_ready
33
+ from paasta_tools.kubernetes_tools import V1Pod
34
+ from paasta_tools.monitoring_tools import check_under_replication
35
+ from paasta_tools.monitoring_tools import send_replication_event
36
+ from paasta_tools.smartstack_tools import KubeSmartstackEnvoyReplicationChecker
37
+ from paasta_tools.utils import is_under_replicated
38
+
39
+
40
+ log = logging.getLogger(__name__)
41
+
42
+
43
def container_lifetime(
    pod: V1Pod,
) -> datetime.timedelta:
    """Return how long the pod has been alive.

    The lifetime is the difference between the current time and
    ``pod.status.start_time``. The current time is taken in the start time's
    own timezone (``tzinfo``), so the subtraction never mixes naive and aware
    datetimes.

    :param pod: pod whose ``status.start_time`` is inspected
    :returns: time elapsed since the pod's start time, or a zero duration when
        the pod has no start time set
    """
    started_at = pod.status.start_time
    if started_at is None:
        # No start time recorded (presumably the pod has not started yet):
        # report a zero lifetime instead of failing on ``None.tzinfo``.
        return datetime.timedelta(0)
    return datetime.datetime.now(started_at.tzinfo) - started_at
49
+
50
+
51
def healthy_flink_containers_cnt(si_pods: Sequence[V1Pod], container_type: str) -> int:
    """Count the healthy Flink pods of the given container type.

    A pod is counted when all of the following hold, checked in this order:

    * its ``flink.yelp.com/container-type`` label equals ``container_type``
    * ``is_pod_ready`` accepts it
    * it has been alive for more than 60 seconds
    """
    healthy = 0
    for candidate in si_pods:
        if candidate.metadata.labels["flink.yelp.com/container-type"] != container_type:
            continue
        if not is_pod_ready(candidate):
            continue
        if container_lifetime(candidate).total_seconds() > 60:
            healthy += 1
    return healthy
62
+
63
+
64
def check_under_registered_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    cr_name: str,
    is_eks: bool,
) -> Tuple[bool, str, str]:
    """Check whether too few taskmanagers are registered with the jobmanager.

    The jobmanager overview is fetched for ``cr_name`` and its ``taskmanagers``
    value (0 when absent) is compared with ``expected_count`` using the
    instance's critical replication percentage. When ``cr_name`` is empty, or
    fetching/evaluating raises ``ValueError``, the dashboard is reported as
    unavailable and the check counts as unhealthy.

    :returns: a tuple of (unhealthy flag, one-line output for logs or
        monitoring events, longer human-readable description)
    """
    # Pessimistic default: only a successful comparison can clear the flag.
    under_registered = True
    if cr_name == "":
        output = f"Dashboard of service {instance_config.job_id} is not available"
    else:
        try:
            jobmanager_overview = flink_tools.get_flink_jobmanager_overview(
                cr_name, instance_config.cluster, is_eks
            )
            reported_cnt = jobmanager_overview.get("taskmanagers", 0)
            threshold = instance_config.get_replication_crit_percentage()
            output = (
                f"{instance_config.job_id} has {reported_cnt}/{expected_count} "
                f"taskmanagers reported by dashboard (threshold: {threshold}%)"
            )
            under_registered, _ = is_under_replicated(
                reported_cnt, expected_count, threshold
            )
        except ValueError as err:
            output = (
                f"Dashboard of service {instance_config.job_id} is not available ({err})"
            )

    if not under_registered:
        return (
            under_registered,
            output,
            f"{instance_config.job_id} taskmanager is available",
        )

    description = f"""
This alert means that the Flink dashboard is not reporting the expected
number of taskmanagers.

Reasons this might be happening:

The service may simply be unhealthy. There also may not be enough resources
in the cluster to support the requested instance count.

Things you can do:

* Fix the cause of the unhealthy service. Try running:

paasta status -s {instance_config.service} -i {instance_config.instance} -c {instance_config.cluster} -vv

"""
    return under_registered, output, description
115
+
116
+
117
def get_cr_name(si_pods: Sequence[V1Pod]) -> str:
    """Derive the Flink custom resource name from the jobmanager pod name.

    Only jobmanager pods that are ready and have been alive for more than 60
    seconds are considered. When exactly one such pod exists, the part of its
    name before ``-jobmanager-`` is returned; otherwise an empty string is
    returned.
    This change is related to FLINK-3129
    """
    ready_jobmanagers = []
    for candidate in si_pods:
        if (
            candidate.metadata.labels["flink.yelp.com/container-type"] == "jobmanager"
            and is_pod_ready(candidate)
            and container_lifetime(candidate).total_seconds() > 60
        ):
            ready_jobmanagers.append(candidate)

    # Zero or several healthy jobmanagers: no resource name is derived.
    if len(ready_jobmanagers) != 1:
        return ""
    (jobmanager,) = ready_jobmanagers
    return jobmanager.metadata.name.split("-jobmanager-")[0]
132
+
133
+
134
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> None:
    """Check the health of one Flink service instance and send a replication event.

    Four checks are run: pod replication of the supervisor (1 expected), of the
    jobmanager (1 expected), of the taskmanagers (``taskmanager.instances`` from
    the instance config, 10 when unset), and the number of taskmanagers
    registered with the jobmanager. The event is CRITICAL when any check
    reports unhealthy and OK otherwise.

    :param instance_config: config of the Flink instance being checked
    :param pods_by_service_instance: pods keyed by service, then by instance
    :param replication_checker: not used by this function (presumably kept so
        the signature matches what the shared ``main`` passes in)
    :param dry_run: forwarded to ``send_replication_event``
    """
    pods_of_service = pods_by_service_instance.get(instance_config.service, {})
    si_pods = pods_of_service.get(instance_config.instance, [])

    taskmanager_settings = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    taskmanagers_expected_cnt = taskmanager_settings.get("instances", 10)

    # (sub component, expected pod count), in the order they are reported.
    expectations = (
        ("supervisor", 1),
        ("jobmanager", 1),
        ("taskmanager", taskmanagers_expected_cnt),
    )
    healthy_counts = {
        component: healthy_flink_containers_cnt(si_pods, component)
        for component, _ in expectations
    }

    service_cr_name = get_cr_name(si_pods)

    results = [
        check_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy_counts[component],
            sub_component=component,
        )
        for component, expected in expectations
    ]
    results.append(
        check_under_registered_taskmanagers(
            instance_config=instance_config,
            expected_count=taskmanagers_expected_cnt,
            cr_name=service_cr_name,
            is_eks=isinstance(instance_config, flinkeks_tools.FlinkEksDeploymentConfig),
        )
    )

    # Each result is indexed as (unhealthy flag, output, description).
    output = ", ".join(result[1] for result in results)
    description = "\n########\n".join(result[2] for result in results)

    is_critical = any(result[0] for result in results)
    if is_critical:
        log.error(output)
    else:
        log.info(output)
    status = pysensu_yelp.Status.CRITICAL if is_critical else pysensu_yelp.Status.OK

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
193
+
194
+
195
if __name__ == "__main__":
    args = parse_args()
    # Select the deployment config class according to the --eks option.
    if args.eks:
        config_class = flinkeks_tools.FlinkEksDeploymentConfig
    else:
        config_class = flink_tools.FlinkDeploymentConfig
    main(
        instance_type_class=config_class,
        check_service_replication=check_flink_service_health,
        namespace="paasta-flinks",
    )
@@ -0,0 +1,57 @@
1
+ #!/opt/venvs/paasta-tools/bin/python
2
+ # Copyright 2015-2016 Yelp Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Usage: ./check_kubernetes_api.py [options]
17
+
18
+ This is a script that checks connectivity and credentials for Kubernetes API.
19
+ """
20
+ import argparse
21
+ import logging
22
+ import sys
23
+
24
+ from paasta_tools.kubernetes_tools import KubeClient
25
+
26
+
27
+ log = logging.getLogger(__name__)
28
+
29
+
30
def parse_args():
    """Parse the command line and return the resulting options.

    The only option is ``-v`` / ``--verbose``, a flag stored as ``verbose``
    that defaults to ``False``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
    )
    return arg_parser.parse_args()
37
+
38
+
39
def main() -> None:
    """Check that the Kubernetes API can be reached, then exit.

    Logging is configured at DEBUG level when ``--verbose`` is given and at
    WARNING level otherwise. The check itself creates a ``KubeClient`` and
    lists namespaces through it.

    Exits with status 0 when the namespace listing succeeds, and with status 1
    (after logging the error) when creating the client or listing namespaces
    raises an exception.
    """
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING)

    try:
        # The client is created inside the ``try`` so that a failure while
        # building it is reported through the same logged error path as a
        # failed API request, rather than as an unhandled traceback.
        kube_client = KubeClient()
        kube_client.core.list_namespace()
    except Exception as exc:
        log.error(f"Error connecting to API: {exc}")
        sys.exit(1)
    else:
        # Kept out of the ``try`` body so only the API interaction is guarded.
        log.info("API is ok")
        sys.exit(0)
54
+
55
+
56
# Run the API check only when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2015-2019 Yelp Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Usage: ./check_kubernetes_services_replication.py [options]
17
+
18
+ This is a script that checks the number of HAProxy backends via Synapse against
19
+ the expected amount that should've been deployed via Kubernetes.
20
+
21
+ Basically, the script checks smartstack.yaml for listed namespaces, and then queries
22
+ Synapse for the number of available backends for that namespace. It then goes through
23
+ the Kubernetes service configuration file for that cluster, and sees how many instances
24
+ are expected to be available for that namespace based on the number of instances deployed
25
+ on that namespace.
26
+
27
+ After retrieving that information, a fraction of available instances is calculated
28
+ (available/expected), and then compared against a threshold. The default threshold is
29
+ 50, meaning if less than 50% of a service's backends are available, the script sends
30
+ CRITICAL. If replication_threshold is defined in the yelpsoa config for a service
31
+ instance then it will be used instead.
32
+ """
33
+ import logging
34
+ from typing import Dict
35
+ from typing import List
36
+ from typing import Optional
37
+ from typing import Union
38
+
39
+ from paasta_tools import eks_tools
40
+ from paasta_tools import kubernetes_tools
41
+ from paasta_tools import monitoring_tools
42
+ from paasta_tools.check_services_replication_tools import main
43
+ from paasta_tools.check_services_replication_tools import parse_args
44
+ from paasta_tools.eks_tools import EksDeploymentConfig
45
+ from paasta_tools.kubernetes_tools import is_pod_ready
46
+ from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
47
+ from paasta_tools.kubernetes_tools import V1Pod
48
+ from paasta_tools.long_running_service_tools import get_proxy_port_for_instance
49
+ from paasta_tools.smartstack_tools import KubeSmartstackEnvoyReplicationChecker
50
+
51
+
52
+ log = logging.getLogger(__name__)
53
+ DEFAULT_ALERT_AFTER = "10m"
54
+
55
+
56
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig],
    expected_count: int,
    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
    dry_run: bool = False,
) -> None:
    """Report replication for an instance from the readiness of its pods.

    The pods registered for the instance are looked up in
    ``pods_by_service_instance``; a missing service or instance counts as no
    pods. The number of pods for which ``is_pod_ready`` is true is then
    handed to ``monitoring_tools.send_replication_event_if_under_replication``
    together with ``expected_count``.

    :param instance_config: config of the instance being checked
    :param expected_count: number of tasks the instance is expected to run
    :param pods_by_service_instance: pods keyed by service name, then instance name
    :param dry_run: forwarded unchanged to the monitoring helper
    """
    service = instance_config.service
    instance = instance_config.instance
    pods_for_service = pods_by_service_instance.get(service, {})
    si_pods = pods_for_service.get(instance, [])

    ready_pod_count = sum(1 for pod in si_pods if is_pod_ready(pod))
    log.info(
        f"Checking {service}.{instance} in kubernetes as it is not in smartstack"
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=ready_pod_count,
        dry_run=dry_run,
    )
76
+
77
+
78
def check_kubernetes_pod_replication(
    instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig],
    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> Optional[bool]:
    """Check one instance's replication via smartstack/envoy or via k8s pods.

    As a side effect, ``instance_config.config_dict["monitoring"]`` is created
    if missing and given an ``alert_after`` value when it has none.

    :param instance_config: an instance of KubernetesDeploymentConfig or EksDeploymentConfig
    :param pods_by_service_instance: pods keyed by service name, then instance name
    :param replication_checker: an instance of KubeSmartstackEnvoyReplicationChecker
    :param dry_run: forwarded unchanged to the monitoring helpers
    :returns: the result of ``monitoring_tools.check_replication_for_instance``
        when the instance has a proxy port and its first registration equals
        its job id; ``None`` when the pod-based check is used instead
    """
    expected_count = instance_config.get_instances()
    log.info(
        "Expecting %d total tasks for %s" % (expected_count, instance_config.job_id)
    )
    proxy_port = get_proxy_port_for_instance(instance_config)
    registrations = instance_config.get_registrations()

    # A non-autoscaled instance running a single copy gets a longer default
    # alert delay (20m); everything else uses DEFAULT_ALERT_AFTER.
    is_fixed_single_instance = (
        not instance_config.is_autoscaling_enabled()
        and instance_config.get_instances() == 1
    )
    default_alert_after = "20m" if is_fixed_single_instance else DEFAULT_ALERT_AFTER

    # Only fill in alert_after when the instance has not configured one.
    monitoring = instance_config.config_dict.setdefault("monitoring", {})
    monitoring.setdefault("alert_after", default_alert_after)

    # if the primary registration does not match the service_instance name then
    # the best we can do is check k8s for replication (for now).
    checked_via_proxy = (
        proxy_port is not None and registrations[0] == instance_config.job_id
    )
    if not checked_via_proxy:
        check_healthy_kubernetes_tasks_for_service_instance(
            instance_config=instance_config,
            expected_count=expected_count,
            pods_by_service_instance=pods_by_service_instance,
            dry_run=dry_run,
        )
        return None

    return monitoring_tools.check_replication_for_instance(
        instance_config=instance_config,
        expected_count=expected_count,
        replication_checker=replication_checker,
        dry_run=dry_run,
    )
132
+
133
+
134
if __name__ == "__main__":
    args = parse_args()
    # Pick the deployment config class according to ``args.eks``: the EKS
    # flavour when it is truthy, the plain Kubernetes one otherwise.
    if args.eks:
        instance_type_class = eks_tools.EksDeploymentConfig
    else:
        instance_type_class = kubernetes_tools.KubernetesDeploymentConfig
    main(
        instance_type_class=instance_type_class,
        check_service_replication=check_kubernetes_pod_replication,
    )
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env python
2
+ # Copyright 2015-2016 Yelp Inc.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import argparse
16
+ import json
17
+ import sys
18
+ import time
19
+
20
+ from pysensu_yelp import Status
21
+
22
+ from paasta_tools import monitoring_tools
23
+ from paasta_tools.cli.cmds.logs import scribe_env_to_locations
24
+ from paasta_tools.cli.utils import get_instance_config
25
+ from paasta_tools.utils import DEFAULT_SOA_DIR
26
+ from paasta_tools.utils import get_services_for_cluster
27
+ from paasta_tools.utils import load_system_paasta_config
28
+
29
+ try:
30
+ from scribereader import scribereader
31
+ from scribereader.clog.readers import StreamTailerSetupError
32
+ except ImportError:
33
+ scribereader = None
34
+
35
+
36
+ OOM_EVENTS_STREAM = "tmp_paasta_oom_events"
37
+
38
+
39
def compose_check_name_for_service_instance(check_name, service, instance):
    """Return the dot-separated check name ``<check_name>.<service>.<instance>``."""
    return "{}.{}.{}".format(check_name, service, instance)
41
+
42
+
43
def parse_args(args):
    """Parse the command line of the OOM-events check.

    :param args: the argument strings to parse (``main`` passes
        ``sys.argv[1:]``)
    :returns: an ``argparse.Namespace`` with ``soa_dir``, ``realert_every``,
        ``check_interval``, ``alert_threshold``, ``superregion`` (required)
        and ``dry_run``
    """
    description = (
        "Check the %s stream and report to Sensu if"
        " there are any OOM events." % OOM_EVENTS_STREAM
    )
    # (flags, keyword arguments) for each option, in the order in which they
    # are registered on the parser.
    option_specs = [
        (
            ("-d", "--soa-dir"),
            {
                "dest": "soa_dir",
                "default": DEFAULT_SOA_DIR,
                "help": "define a different soa config directory",
            },
        ),
        (
            ("-r", "--realert-every"),
            {
                "dest": "realert_every",
                "type": int,
                "default": 1,
                "help": "Sensu 'realert_every' to use.",
            },
        ),
        (
            ("--check-interval",),
            {
                "dest": "check_interval",
                "type": int,
                "default": 1,
                "help": "How often this check runs, in minutes.",
            },
        ),
        (
            ("--alert-threshold",),
            {
                "dest": "alert_threshold",
                "type": int,
                "default": 1,
                "help": "Number of OOM kills required in the check interval to send an alert.",
            },
        ),
        (
            ("-s", "--superregion"),
            {
                "dest": "superregion",
                "required": True,
                "help": "The superregion to read OOM events from.",
            },
        ),
        (
            ("--dry-run",),
            {
                "dest": "dry_run",
                "action": "store_true",
                "help": "Print Sensu alert events instead of sending them",
            },
        ),
    ]

    cli_parser = argparse.ArgumentParser(description=description)
    for flags, settings in option_specs:
        cli_parser.add_argument(*flags, **settings)
    return cli_parser.parse_args(args)
93
+
94
+
95
def read_oom_events_from_scribe(cluster, superregion, num_lines=1000):
    """Yield the OOM events of ``cluster`` found in the tail of OOM_EVENTS_STREAM.

    The latest ``num_lines`` lines of the stream are read. Each line is
    decoded as JSON; lines that are not valid JSON, or that do not decode to
    a JSON object, are skipped. Only events whose ``"cluster"`` field equals
    ``cluster`` are yielded.

    :param cluster: cluster name, used both to look up the scribe environment
        and to filter events
    :param superregion: superregion passed to the stream tailer
    :param num_lines: how many trailing lines of the stream to read
    :returns: a generator of event dicts
    :raises StreamTailerSetupError: when tailing fails for any reason other
        than the stream being empty
    """
    # paasta configs include a map for cluster -> env that is expected by scribe
    log_reader_config = load_system_paasta_config().get_log_reader()
    cluster_map = log_reader_config["options"]["cluster_map"]
    scribe_env = cluster_map[cluster]

    # `scribe_env_to_locations` slightly mutates the scribe env based on whether
    # or not it is in dev or prod
    host, port = scribereader.get_tail_host_and_port(
        **scribe_env_to_locations(scribe_env),
    )
    stream = scribereader.get_stream_tailer(
        stream_name=OOM_EVENTS_STREAM,
        tailing_host=host,
        tailing_port=port,
        lines=num_lines,
        superregion=superregion,
    )
    try:
        for line in stream:
            try:
                event = json.loads(line)
            except json.decoder.JSONDecodeError:
                # Best effort: ignore lines that are not valid JSON.
                continue
            # Valid JSON is not necessarily an object (it may be a number,
            # string, list, ...); such lines cannot be OOM events.
            if isinstance(event, dict) and event.get("cluster", "") == cluster:
                yield event
    except StreamTailerSetupError as e:
        # An empty stream simply means there is nothing to report.
        if "No data in stream" not in str(e):
            raise
127
+
128
+
129
def latest_oom_events(cluster, superregion, interval=60):
    """Group the recent OOM events of ``cluster`` by service instance.

    Only events whose ``"timestamp"`` is strictly greater than the current
    time minus ``interval`` (in seconds) are kept.

    :param cluster: cluster whose events are read
    :param superregion: superregion to read the events from
    :param interval: size of the look-back window, in seconds
    :returns: ``{(service, instance): {container_id, ...}}``; a key is only
        present when at least one recent event exists for it, and events
        without a ``"container_id"`` contribute the empty string
    """
    cutoff = int(time.time()) - interval
    containers_by_instance = {}
    for event in read_oom_events_from_scribe(cluster, superregion):
        if not event["timestamp"] > cutoff:
            continue
        service_instance = (event["service"], event["instance"])
        if service_instance not in containers_by_instance:
            containers_by_instance[service_instance] = set()
        containers_by_instance[service_instance].add(event.get("container_id", ""))
    return containers_by_instance
141
+
142
+
143
def compose_sensu_status(
    instance, oom_events, is_check_enabled, alert_threshold, check_interval
):
    """Decide which Sensu status, if any, to report for an instance.

    :param instance: InstanceConfig; only ``service`` and ``instance`` are read
    :param oom_events: collection of OOM events; only its emptiness and
        length are used
    :param is_check_enabled: boolean to indicate whether the check is enabled
        for the instance
    :param alert_threshold: minimum number of events that triggers CRITICAL
    :param check_interval: length of the checked window in minutes, used in
        the message only
    :returns: a ``(status, message)`` tuple, or ``None`` when there are events
        but fewer than ``alert_threshold``
    """
    interval_string = f"{check_interval} minute(s)"
    instance_name = f"{instance.service}.{instance.instance}"

    if not is_check_enabled:
        return (Status.OK, f"This check is disabled for {instance_name}.")

    if not oom_events:
        return (
            Status.OK,
            f"No oom events for {instance_name} in the last {interval_string}.",
        )

    if len(oom_events) >= alert_threshold:
        return (
            Status.CRITICAL,
            f"The Out Of Memory killer killed processes for {instance_name} "
            f"in the last {interval_string}.",
        )

    # If the number of OOM kills isn't above the alert threshold,
    # don't send anything. This will keep an alert open if it's already open,
    # but won't start a new alert if there wasn't one yet
    return None
171
+
172
+
173
def send_sensu_event(instance, oom_events, args):
    """Send the OOM-killer Sensu event for one instance, if one is due.

    :param instance: InstanceConfig
    :param oom_events: collection of OOM events recorded for the instance
    :param args: parsed command line; ``alert_threshold``, ``check_interval``,
        ``realert_every`` and ``dry_run`` are read
    :returns: the result of ``monitoring_tools.send_event``, or ``None`` when
        ``compose_sensu_status`` yields no status to report
    """
    check_name = compose_check_name_for_service_instance(
        "oom-killer", instance.service, instance.instance
    )
    # Work on a copy: the overrides added below are specific to this check
    # and must not be written back into the dict owned by the instance.
    monitoring_overrides = dict(instance.get_monitoring())
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=monitoring_overrides.get("check_oom_events", True),
        alert_threshold=args.alert_threshold,
        check_interval=args.check_interval,
    )
    if not status:
        return

    memory_limit = instance.get_mem()
    try:
        memory_limit_str = f"{int(memory_limit)}MB"
    except (TypeError, ValueError):
        # Not convertible to an integer (``int`` raises TypeError for values
        # such as None and ValueError for non-numeric strings): show it as is.
        memory_limit_str = memory_limit

    monitoring_overrides.update(
        {
            "page": False,
            "alert_after": "0m",
            "realert_every": args.realert_every,
            "runbook": "y/check-oom-events",
            "tip": (
                "Follow the runbook to investigate and rightsize memory usage "
                f"(curr: {memory_limit_str})"
            ),
        }
    )
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=monitoring_overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
        dry_run=args.dry_run,
    )
219
+
220
+
221
def main(sys_argv):
    """Run the OOM-events check for every service instance of the cluster.

    Recent OOM events are collected once for the local cluster, then a Sensu
    event is composed for each ``(service, instance)`` returned by
    ``get_services_for_cluster``.

    :param sys_argv: full argument vector; the first element is dropped
        before parsing
    """
    args = parse_args(sys_argv[1:])
    cluster = load_system_paasta_config().get_cluster()
    # --check-interval is expressed in minutes, the look-back window in seconds.
    lookback_seconds = 60 * args.check_interval
    victims = latest_oom_events(cluster, args.superregion, interval=lookback_seconds)

    service_instances = get_services_for_cluster(cluster, soa_dir=args.soa_dir)
    for service, instance in service_instances:
        try:
            instance_config = get_instance_config(
                service=service,
                instance=instance,
                cluster=cluster,
                load_deployments=False,
                soa_dir=args.soa_dir,
            )
            send_sensu_event(
                instance_config, victims.get((service, instance), []), args
            )
        except NotImplementedError:
            # When instance_type is not supported by get_instance_config
            continue
241
+
242
+
243
# Run the OOM-events check only when this file is executed as a script; the
# full argument vector is passed and main() strips the program name itself.
if __name__ == "__main__":
    main(sys.argv)