paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2015-2019 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
Usage: ./check_cassandracluster_services_replication.py [options]
|
|
17
|
+
"""
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
from paasta_tools import cassandracluster_tools
|
|
21
|
+
from paasta_tools.check_kubernetes_services_replication import (
|
|
22
|
+
check_kubernetes_pod_replication,
|
|
23
|
+
)
|
|
24
|
+
from paasta_tools.check_services_replication_tools import main
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
log = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if __name__ == "__main__":
    # Run the shared replication-check entry point with the CassandraCluster
    # config class and the generic Kubernetes pod replication check.
    config_class = cassandracluster_tools.CassandraClusterDeploymentConfig
    main(
        config_class,
        check_kubernetes_pod_replication,
        namespace="paasta-cassandraclusters",
    )
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2015-2019 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
Usage: ./check_flink_services_health.py [options]
|
|
17
|
+
"""
|
|
18
|
+
import datetime
|
|
19
|
+
import logging
|
|
20
|
+
from typing import Dict
|
|
21
|
+
from typing import List
|
|
22
|
+
from typing import Sequence
|
|
23
|
+
from typing import Tuple
|
|
24
|
+
|
|
25
|
+
import pysensu_yelp
|
|
26
|
+
|
|
27
|
+
from paasta_tools import flink_tools
|
|
28
|
+
from paasta_tools import flinkeks_tools
|
|
29
|
+
from paasta_tools.check_services_replication_tools import main
|
|
30
|
+
from paasta_tools.check_services_replication_tools import parse_args
|
|
31
|
+
from paasta_tools.flink_tools import FlinkDeploymentConfig
|
|
32
|
+
from paasta_tools.kubernetes_tools import is_pod_ready
|
|
33
|
+
from paasta_tools.kubernetes_tools import V1Pod
|
|
34
|
+
from paasta_tools.monitoring_tools import check_under_replication
|
|
35
|
+
from paasta_tools.monitoring_tools import send_replication_event
|
|
36
|
+
from paasta_tools.smartstack_tools import KubeSmartstackEnvoyReplicationChecker
|
|
37
|
+
from paasta_tools.utils import is_under_replicated
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
log = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def container_lifetime(
    pod: V1Pod,
) -> datetime.timedelta:
    """Return how long ago the pod's recorded start time was.

    :param pod: pod whose ``status.start_time`` is read
    :returns: current time minus the start time, as a timedelta
    """
    started_at = pod.status.start_time
    # Take "now" in the same timezone as the start time so the subtraction
    # never mixes naive and aware datetimes.
    now = datetime.datetime.now(started_at.tzinfo)
    return now - started_at
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def healthy_flink_containers_cnt(si_pods: Sequence[V1Pod], container_type: str) -> int:
    """Return the count of healthy Flink pods of the given container type.

    A pod is counted when its ``flink.yelp.com/container-type`` label equals
    ``container_type``, ``is_pod_ready`` reports it ready, and it has been
    alive for more than 60 seconds.

    Pods that have no labels, or lack the container-type label, are skipped
    instead of raising, so a single unlabeled pod cannot abort the check.

    :param si_pods: pods belonging to one service instance
    :param container_type: label value to match
    :returns: number of matching healthy pods
    """
    type_label = "flink.yelp.com/container-type"
    return sum(
        1
        for pod in si_pods
        # ``labels`` can be None on a pod with no labels at all.
        if (pod.metadata.labels or {}).get(type_label) == container_type
        and is_pod_ready(pod)
        and container_lifetime(pod).total_seconds() > 60
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def check_under_registered_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    cr_name: str,
    is_eks: bool,
) -> Tuple[bool, str, str]:
    """Check whether too few taskmanagers are registered with the jobmanager.

    The jobmanager overview is fetched for ``cr_name`` and its reported
    taskmanager count is compared with ``expected_count`` using the instance's
    critical replication percentage.  An empty ``cr_name``, or a ValueError
    raised while running the check, is treated as unhealthy.

    :param instance_config: config of the Flink instance being checked
    :param expected_count: number of taskmanagers that should be registered
    :param cr_name: Flink custom resource name, or "" when it is unknown
    :param is_eks: forwarded to the jobmanager overview lookup
    :returns: (unhealthy, output, description); ``output`` is a one-line
        summary and ``description`` a longer human-readable text
    """
    # Stay unhealthy unless the dashboard check below says otherwise.
    unhealthy = True
    if cr_name == "":
        output = f"Dashboard of service {instance_config.job_id} is not available"
    else:
        try:
            overview = flink_tools.get_flink_jobmanager_overview(
                cr_name, instance_config.cluster, is_eks
            )
            num_reported = overview.get("taskmanagers", 0)
            crit_threshold = instance_config.get_replication_crit_percentage()
            output = (
                f"{instance_config.job_id} has {num_reported}/{expected_count} "
                f"taskmanagers reported by dashboard (threshold: {crit_threshold}%)"
            )
            unhealthy, _ = is_under_replicated(
                num_reported, expected_count, crit_threshold
            )
        except ValueError as err:
            output = (
                f"Dashboard of service {instance_config.job_id} is not available ({err})"
            )

    if not unhealthy:
        description = f"{instance_config.job_id} taskmanager is available"
    else:
        # NOTE(review): leading whitespace inside this message could not be
        # recovered from the flattened source view -- confirm against the
        # packaged file before relying on its exact layout.
        description = f"""
This alert means that the Flink dashboard is not reporting the expected
number of taskmanagers.

Reasons this might be happening:

The service may simply be unhealthy. There also may not be enough resources
in the cluster to support the requested instance count.

Things you can do:

* Fix the cause of the unhealthy service. Try running:

paasta status -s {instance_config.service} -i {instance_config.instance} -c {instance_config.cluster} -vv

"""
    return unhealthy, output, description
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_cr_name(si_pods: Sequence[V1Pod]) -> str:
    """Return the Flink custom resource name derived from the jobmanager pod name.

    The name is the part of the pod name that precedes ``-jobmanager-``.  It is
    only returned when exactly one jobmanager pod is ready and has been alive
    for more than 60 seconds; with zero or several such pods an empty string
    is returned.

    Pods that have no labels, or lack the ``flink.yelp.com/container-type``
    label, are ignored instead of raising.

    This change is related to FLINK-3129

    :param si_pods: pods belonging to one service instance
    :returns: the custom resource name, or "" when it cannot be determined
    """
    type_label = "flink.yelp.com/container-type"
    jobmanager_pods = [
        pod
        for pod in si_pods
        # ``labels`` can be None on a pod with no labels at all.
        if (pod.metadata.labels or {}).get(type_label) == "jobmanager"
        and is_pod_ready(pod)
        and container_lifetime(pod).total_seconds() > 60
    ]
    if len(jobmanager_pods) != 1:
        return ""
    return jobmanager_pods[0].metadata.name.split("-jobmanager-")[0]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> None:
    """Check one Flink service instance and send a replication event for it.

    Four checks are run: under-replication of the supervisor, jobmanager and
    taskmanager pods, plus the number of taskmanagers registered with the
    jobmanager.  If any check reports a problem the event is sent as CRITICAL,
    otherwise as OK.

    :param instance_config: config of the Flink instance being checked
    :param pods_by_service_instance: pods keyed by service, then by instance
    :param replication_checker: not used by this function
    :param dry_run: forwarded to ``send_replication_event``
    """
    pods_for_service = pods_by_service_instance.get(instance_config.service, {})
    si_pods = pods_for_service.get(instance_config.instance, [])

    # The expected taskmanager count falls back to 10 when not configured.
    taskmanager_config = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    taskmanagers_expected_cnt = taskmanager_config.get("instances", 10)

    expected_by_component = (
        ("supervisor", 1),
        ("jobmanager", 1),
        ("taskmanager", taskmanagers_expected_cnt),
    )
    healthy_by_component = {
        component: healthy_flink_containers_cnt(si_pods, component)
        for component, _ in expected_by_component
    }

    service_cr_name = get_cr_name(si_pods)

    results = [
        check_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy_by_component[component],
            sub_component=component,
        )
        for component, expected in expected_by_component
    ]
    results.append(
        check_under_registered_taskmanagers(
            instance_config=instance_config,
            expected_count=taskmanagers_expected_cnt,
            cr_name=service_cr_name,
            is_eks=isinstance(instance_config, flinkeks_tools.FlinkEksDeploymentConfig),
        )
    )

    # Each result is indexed as (unhealthy, output, description).
    output = ", ".join(result[1] for result in results)
    description = "\n########\n".join(result[2] for result in results)
    if any(result[0] for result in results):
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
if __name__ == "__main__":
    args = parse_args()
    # Pick the deployment config class according to args.eks.
    if args.eks:
        config_class = flinkeks_tools.FlinkEksDeploymentConfig
    else:
        config_class = flink_tools.FlinkDeploymentConfig
    main(
        instance_type_class=config_class,
        check_service_replication=check_flink_service_health,
        namespace="paasta-flinks",
    )
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#!/opt/venvs/paasta-tools/bin/python
|
|
2
|
+
# Copyright 2015-2016 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
Usage: ./check_kubernetes_api.py [options]
|
|
17
|
+
|
|
18
|
+
This is a script that checks connectivity and credentials for Kubernetes API.
|
|
19
|
+
"""
|
|
20
|
+
import argparse
|
|
21
|
+
import logging
|
|
22
|
+
import sys
|
|
23
|
+
|
|
24
|
+
from paasta_tools.kubernetes_tools import KubeClient
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
log = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def parse_args():
    """Parse the command line; the only option defined is -v/--verbose.

    :returns: the parsed argparse namespace, with a boolean ``verbose``
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="store_true",
        default=False,
    )
    return arg_parser.parse_args()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def main() -> None:
    """Probe the Kubernetes API and exit 0 on success, 1 on any error.

    The probe is a namespace listing through ``KubeClient``.  Logging is set
    to DEBUG when verbose output is requested and to WARNING otherwise.
    """
    args = parse_args()
    log_level = logging.DEBUG if args.verbose else logging.WARNING
    logging.basicConfig(level=log_level)

    kube_client = KubeClient()
    try:
        kube_client.core.list_namespace()
        log.info("API is ok")
    except Exception as exc:
        log.error(f"Error connecting to API: {exc}")
        sys.exit(1)
    # Only reached when the probe above did not raise.
    sys.exit(0)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if __name__ == "__main__":
|
|
57
|
+
main()
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2015-2019 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
Usage: ./check_kubernetes_services_replication.py [options]
|
|
17
|
+
|
|
18
|
+
This is a script that checks the number of HAProxy backends via Synapse against
|
|
19
|
+
the expected amount that should've been deployed via Kubernetes.
|
|
20
|
+
|
|
21
|
+
Basically, the script checks smartstack.yaml for listed namespaces, and then queries
|
|
22
|
+
Synapse for the number of available backends for that namespace. It then goes through
|
|
23
|
+
the Kubernetes service configuration file for that cluster, and sees how many instances
|
|
24
|
+
are expected to be available for that namespace based on the number of instances deployed
|
|
25
|
+
on that namespace.
|
|
26
|
+
|
|
27
|
+
After retrieving that information, a fraction of available instances is calculated
|
|
28
|
+
(available/expected), and then compared against a threshold. The default threshold is
|
|
29
|
+
50, meaning if less than 50% of a service's backends are available, the script sends
|
|
30
|
+
CRITICAL. If replication_threshold is defined in the yelpsoa config for a service
|
|
31
|
+
instance then it will be used instead.
|
|
32
|
+
"""
|
|
33
|
+
import logging
|
|
34
|
+
from typing import Dict
|
|
35
|
+
from typing import List
|
|
36
|
+
from typing import Optional
|
|
37
|
+
from typing import Union
|
|
38
|
+
|
|
39
|
+
from paasta_tools import eks_tools
|
|
40
|
+
from paasta_tools import kubernetes_tools
|
|
41
|
+
from paasta_tools import monitoring_tools
|
|
42
|
+
from paasta_tools.check_services_replication_tools import main
|
|
43
|
+
from paasta_tools.check_services_replication_tools import parse_args
|
|
44
|
+
from paasta_tools.eks_tools import EksDeploymentConfig
|
|
45
|
+
from paasta_tools.kubernetes_tools import is_pod_ready
|
|
46
|
+
from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
|
|
47
|
+
from paasta_tools.kubernetes_tools import V1Pod
|
|
48
|
+
from paasta_tools.long_running_service_tools import get_proxy_port_for_instance
|
|
49
|
+
from paasta_tools.smartstack_tools import KubeSmartstackEnvoyReplicationChecker
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Fallback "alert_after" written into an instance's monitoring config when the
# instance does not define one (see check_kubernetes_pod_replication, which
# swaps in "20m" for non-autoscaled single-instance services).
DEFAULT_ALERT_AFTER = "10m"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig],
    expected_count: int,
    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
    dry_run: bool = False,
) -> None:
    """Report replication for an instance from its ready Kubernetes pods.

    Looks up the pods recorded for the instance, counts those accepted by
    ``is_pod_ready`` and forwards that count to
    ``monitoring_tools.send_replication_event_if_under_replication``.

    :param instance_config: config of the service instance being checked
    :param expected_count: number of instances expected for the instance
    :param pods_by_service_instance: pods keyed by service name, then by
        instance name; a missing service or instance counts as no pods
    :param dry_run: passed through unchanged to the monitoring helper
    """
    pods_for_service = pods_by_service_instance.get(instance_config.service, {})
    candidate_pods = pods_for_service.get(instance_config.instance, [])

    ready_pod_count = 0
    for candidate in candidate_pods:
        if is_pod_ready(candidate):
            ready_pod_count += 1

    message = f"Checking {instance_config.service}.{instance_config.instance} in kubernetes as it is not in smartstack"
    log.info(message)
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=ready_pod_count,
        dry_run=dry_run,
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def check_kubernetes_pod_replication(
    instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig],
    pods_by_service_instance: Dict[str, Dict[str, List[V1Pod]]],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> Optional[bool]:
    """Check an instance's replication via smartstack/envoy or via k8s pods.

    Before checking, an "alert_after" value is filled into the instance's
    monitoring config if it does not already define one.

    :param instance_config: an instance of KubernetesDeploymentConfig or EksDeploymentConfig
    :param pods_by_service_instance: pods keyed by service name, then instance name
    :param replication_checker: an instance of KubeSmartstackEnvoyReplicationChecker
    :param dry_run: passed through unchanged to the monitoring helpers
    :returns: the result of ``monitoring_tools.check_replication_for_instance``
        when the instance is checked through the replication checker, or
        ``None`` when the check falls back to counting ready pods
    """
    expected_count = instance_config.get_instances()
    log.info(
        "Expecting %d total tasks for %s" % (expected_count, instance_config.job_id)
    )
    proxy_port = get_proxy_port_for_instance(instance_config)
    registrations = instance_config.get_registrations()

    # A non-autoscaled instance with exactly one replica gets 20m before
    # alerting; every other instance uses the module default.
    is_fixed_single_replica = (
        not instance_config.is_autoscaling_enabled()
        and instance_config.get_instances() == 1
    )
    fallback_alert_after = "20m" if is_fixed_single_replica else DEFAULT_ALERT_AFTER

    # Only fill in alert_after when the instance config has not set it.
    monitoring_section = instance_config.config_dict.setdefault("monitoring", {})
    monitoring_section.setdefault("alert_after", fallback_alert_after)

    # if the primary registration does not match the service_instance name then
    # the best we can do is check k8s for replication (for now).
    checked_via_registration = (
        proxy_port is not None and registrations[0] == instance_config.job_id
    )
    if not checked_via_registration:
        check_healthy_kubernetes_tasks_for_service_instance(
            instance_config=instance_config,
            expected_count=expected_count,
            pods_by_service_instance=pods_by_service_instance,
            dry_run=dry_run,
        )
        return None

    return monitoring_tools.check_replication_for_instance(
        instance_config=instance_config,
        expected_count=expected_count,
        replication_checker=replication_checker,
        dry_run=dry_run,
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Script entry point: parse the command line and run the shared replication
# check `main`. args.eks selects EksDeploymentConfig; otherwise
# KubernetesDeploymentConfig is used. check_kubernetes_pod_replication is
# supplied as the per-instance check callback.
if __name__ == "__main__":
    args = parse_args()
    main(
        instance_type_class=eks_tools.EksDeploymentConfig
        if args.eks
        else kubernetes_tools.KubernetesDeploymentConfig,
        check_service_replication=check_kubernetes_pod_replication,
    )
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# Copyright 2015-2016 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import sys
|
|
18
|
+
import time
|
|
19
|
+
|
|
20
|
+
from pysensu_yelp import Status
|
|
21
|
+
|
|
22
|
+
from paasta_tools import monitoring_tools
|
|
23
|
+
from paasta_tools.cli.cmds.logs import scribe_env_to_locations
|
|
24
|
+
from paasta_tools.cli.utils import get_instance_config
|
|
25
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
26
|
+
from paasta_tools.utils import get_services_for_cluster
|
|
27
|
+
from paasta_tools.utils import load_system_paasta_config
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
from scribereader import scribereader
|
|
31
|
+
from scribereader.clog.readers import StreamTailerSetupError
|
|
32
|
+
except ImportError:
|
|
33
|
+
scribereader = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Name of the scribe stream that OOM events are tailed from
# (see read_oom_events_from_scribe).
OOM_EVENTS_STREAM = "tmp_paasta_oom_events"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def compose_check_name_for_service_instance(check_name, service, instance):
    """Return the check name in the form ``<check_name>.<service>.<instance>``."""
    return "{}.{}.{}".format(check_name, service, instance)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse_args(args):
    """Parse the command-line options of the OOM events check.

    :param args: list of argument strings to parse
    :returns: the namespace produced by ``argparse`` for those arguments
    """
    summary = (
        "Check the %s stream and report to Sensu if"
        " there are any OOM events." % OOM_EVENTS_STREAM
    )
    oom_parser = argparse.ArgumentParser(description=summary)

    # (flags, keyword settings) for every option, in registration order.
    option_table = (
        (
            ("-d", "--soa-dir"),
            {
                "dest": "soa_dir",
                "default": DEFAULT_SOA_DIR,
                "help": "define a different soa config directory",
            },
        ),
        (
            ("-r", "--realert-every"),
            {
                "dest": "realert_every",
                "type": int,
                "default": 1,
                "help": "Sensu 'realert_every' to use.",
            },
        ),
        (
            ("--check-interval",),
            {
                "dest": "check_interval",
                "type": int,
                "default": 1,
                "help": "How often this check runs, in minutes.",
            },
        ),
        (
            ("--alert-threshold",),
            {
                "dest": "alert_threshold",
                "type": int,
                "default": 1,
                "help": "Number of OOM kills required in the check interval to send an alert.",
            },
        ),
        (
            ("-s", "--superregion"),
            {
                "dest": "superregion",
                "required": True,
                "help": "The superregion to read OOM events from.",
            },
        ),
        (
            ("--dry-run",),
            {
                "dest": "dry_run",
                "action": "store_true",
                "help": "Print Sensu alert events instead of sending them",
            },
        ),
    )
    for flags, settings in option_table:
        oom_parser.add_argument(*flags, **settings)

    return oom_parser.parse_args(args)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def read_oom_events_from_scribe(cluster, superregion, num_lines=1000):
    """Yield the events for ``cluster`` from the tail of OOM_EVENTS_STREAM.

    The latest ``num_lines`` lines of the stream are requested. Lines that are
    not valid JSON are skipped, and so are events whose "cluster" field is not
    equal to ``cluster``. A ``StreamTailerSetupError`` whose message contains
    "No data in stream" ends the iteration quietly; any other one is re-raised.
    """
    # The paasta log reader config includes a cluster -> env map; the env is
    # what scribe expects.
    reader_options = load_system_paasta_config().get_log_reader()["options"]
    scribe_env = reader_options["cluster_map"][cluster]

    # `scribe_env_to_locations` slightly mutates the scribe env based on whether
    # or not it is in dev or prod
    host, port = scribereader.get_tail_host_and_port(
        **scribe_env_to_locations(scribe_env),
    )
    tailer = scribereader.get_stream_tailer(
        stream_name=OOM_EVENTS_STREAM,
        tailing_host=host,
        tailing_port=port,
        lines=num_lines,
        superregion=superregion,
    )
    try:
        for raw_line in tailer:
            try:
                event = json.loads(raw_line)
                if event.get("cluster", "") != cluster:
                    continue
                yield event
            except json.decoder.JSONDecodeError:
                continue
    except StreamTailerSetupError as setup_error:
        if "No data in stream" not in str(setup_error):
            raise setup_error
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def latest_oom_events(cluster, superregion, interval=60):
    """Group the recent OOM events of ``cluster`` by service instance.

    Only events whose "timestamp" is greater than ``int(time.time()) - interval``
    are kept (so ``interval`` is in seconds; event timestamps are presumably
    epoch seconds as well -- confirm against the event producer).

    :returns: {(service, instance): {container_id, container_id, ...}}
              containing only service instances with at least one recent
              event; an event without a "container_id" contributes ""
    """
    cutoff = int(time.time()) - interval
    containers_by_instance = {}
    for event in read_oom_events_from_scribe(cluster, superregion):
        if event["timestamp"] > cutoff:
            service_instance = (event["service"], event["instance"])
            if service_instance not in containers_by_instance:
                containers_by_instance[service_instance] = set()
            containers_by_instance[service_instance].add(
                event.get("container_id", "")
            )
    return containers_by_instance
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def compose_sensu_status(
    instance, oom_events, is_check_enabled, alert_threshold, check_interval
):
    """Decide which Sensu status, if any, to report for an instance.

    :param instance: InstanceConfig
    :param oom_events: collection of OOM events for the instance; only its
        emptiness and its length are inspected
    :param is_check_enabled: boolean to indicate whether the check enabled for the instance
    :param alert_threshold: minimum number of events that triggers CRITICAL
    :param check_interval: interval, in minutes, quoted in the output message
    :returns: a ``(status, output)`` tuple, or ``None`` when there are events
        but fewer than ``alert_threshold``
    """
    instance_name = f"{instance.service}.{instance.instance}"
    interval_string = f"{check_interval} minute(s)"

    if not is_check_enabled:
        disabled_output = f"This check is disabled for {instance_name}."
        return (Status.OK, disabled_output)

    if not oom_events:
        quiet_output = (
            f"No oom events for {instance_name} in the last {interval_string}."
        )
        return (Status.OK, quiet_output)

    if len(oom_events) >= alert_threshold:
        critical_output = (
            f"The Out Of Memory killer killed processes for {instance_name} "
            f"in the last {interval_string}."
        )
        return (Status.CRITICAL, critical_output)

    # If the number of OOM kills isn't above the alert threshold,
    # don't send anything. This will keep an alert open if it's already open,
    # but won't start a new alert if there wasn't one yet
    return None
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def send_sensu_event(instance, oom_events, args):
    """Send (or, in dry-run mode, print) the OOM Sensu event for an instance.

    Nothing is sent when ``compose_sensu_status`` returns no status.

    :param instance: InstanceConfig
    :param oom_events: collection of OOM events recorded for the instance
    :param args: parsed command-line arguments; ``alert_threshold``,
        ``check_interval``, ``realert_every`` and ``dry_run`` are read
    :returns: whatever ``monitoring_tools.send_event`` returns, or ``None``
        when no event is sent
    """
    check_name = compose_check_name_for_service_instance(
        "oom-killer", instance.service, instance.instance
    )
    # Work on a copy: get_monitoring() may hand back the instance's own
    # monitoring dict, and the OOM-specific overrides set below must not leak
    # into it.
    monitoring_overrides = dict(instance.get_monitoring())
    status = compose_sensu_status(
        instance=instance,
        oom_events=oom_events,
        is_check_enabled=monitoring_overrides.get("check_oom_events", True),
        alert_threshold=args.alert_threshold,
        check_interval=args.check_interval,
    )
    if not status:
        return

    memory_limit = instance.get_mem()
    try:
        memory_limit_str = f"{int(memory_limit)}MB"
    except (TypeError, ValueError):
        # Not convertible to an integer (e.g. None or a non-numeric string):
        # show the raw value in the tip instead of failing the whole check.
        memory_limit_str = memory_limit

    monitoring_overrides.update(
        {
            "page": False,
            "alert_after": "0m",
            "realert_every": args.realert_every,
            "runbook": "y/check-oom-events",
            "tip": (
                "Follow the runbook to investigate and rightsize memory usage "
                f"(curr: {memory_limit_str})"
            ),
        }
    )
    return monitoring_tools.send_event(
        service=instance.service,
        check_name=check_name,
        overrides=monitoring_overrides,
        status=status[0],
        output=status[1],
        soa_dir=instance.soa_dir,
        dry_run=args.dry_run,
    )
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def main(sys_argv):
    """Run the OOM events check for every service instance of this cluster.

    :param sys_argv: full argument vector; its first element is dropped
        before the remaining arguments are parsed
    """
    args = parse_args(sys_argv[1:])
    cluster = load_system_paasta_config().get_cluster()
    # args.check_interval is in minutes; latest_oom_events takes seconds.
    window_seconds = 60 * args.check_interval
    victims = latest_oom_events(cluster, args.superregion, interval=window_seconds)

    service_instances = get_services_for_cluster(cluster, soa_dir=args.soa_dir)
    for service, instance in service_instances:
        try:
            send_sensu_event(
                get_instance_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    load_deployments=False,
                    soa_dir=args.soa_dir,
                ),
                victims.get((service, instance), []),
                args,
            )
        except NotImplementedError:  # When instance_type is not supported by get_instance_config
            continue
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# Script entry point: hand the full argument vector to main(), which drops
# the program name itself before parsing.
if __name__ == "__main__":
    main(sys.argv)
|