paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
import argparse
|
|
3
|
+
import asyncio
|
|
4
|
+
import functools
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os.path
|
|
8
|
+
import re
|
|
9
|
+
import socket
|
|
10
|
+
import sys
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Any
|
|
14
|
+
from typing import DefaultDict
|
|
15
|
+
from typing import Dict
|
|
16
|
+
from typing import List
|
|
17
|
+
from typing import NamedTuple
|
|
18
|
+
from typing import Optional
|
|
19
|
+
from typing import Set
|
|
20
|
+
from typing import Tuple
|
|
21
|
+
|
|
22
|
+
from kazoo.client import KazooClient
|
|
23
|
+
from kazoo.exceptions import NoNodeError
|
|
24
|
+
|
|
25
|
+
from paasta_tools import yaml_tools as yaml
|
|
26
|
+
|
|
27
|
+
# Module-level logger; named after the script so log lines are attributable
# regardless of how the module is invoked.
logger = logging.getLogger("check_orphans")

# Zookeeper path prefix under which smartstack service registrations live.
PREFIX = "/smartstack/global/"
CHUNK_SIZE = 50  # How many concurrent xinetd connections
# Default yaml file describing the local zookeeper ensemble (host/port pairs).
DEFAULT_ZK_DISCOVERY_PATH = "/nail/etc/zookeeper_discovery/infrastructure/local.yaml"
# Port on which each host's xinetd service serves its nerve config file.
DEFAULT_NERVE_XINETD_PORT = 8735
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ExitCode(Enum):
    """Process exit statuses this check can report."""

    OK = 0  # nothing wrong found
    ORPHANS = 1  # zk registrations with no backing nerve registration
    COLLISIONS = 2  # a (host, port) claimed by a different service
    UNKNOWN = 3  # bad invocation / could not determine status
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_zk_hosts(path: str) -> List[str]:
    """Load the zookeeper discovery file and return "host:port" strings.

    The file is expected to be a yaml sequence of (host, port) pairs.
    """
    with open(path) as discovery_file:
        host_port_pairs = yaml.safe_load(discovery_file)
    hosts: List[str] = []
    for host, port in host_port_pairs:
        hosts.append(f"{host}:{port}")
    return hosts
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Mapping of service name -> {instance name -> decoded registration payload}.
SmartstackData = Dict[str, Dict[str, Any]]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_zk_data(ignored_services: Set[str]) -> SmartstackData:
    """Dump all smartstack registrations from zookeeper.

    Args:
        ignored_services: service names to skip entirely.

    Returns:
        Mapping of service name -> {instance name -> decoded registration}.
    """
    logger.info(f"using {DEFAULT_ZK_DISCOVERY_PATH} for zookeeper")
    zk_hosts = get_zk_hosts(DEFAULT_ZK_DISCOVERY_PATH)

    logger.debug(f"connecting to zk hosts {zk_hosts}")
    zk = KazooClient(hosts=zk_hosts)
    zk.start()

    try:
        logger.debug("pulling smartstack data from zookeeper")
        zk_data = {}
        for service in zk.get_children(PREFIX):
            if service in ignored_services:
                continue
            instances_data = {}
            for instance in zk.get_children(os.path.join(PREFIX, service)):
                try:
                    instance_node = zk.get(os.path.join(PREFIX, service, instance))
                except NoNodeError:
                    # Instance deregistered between listing and fetching; skip.
                    continue
                instances_data[instance] = json.loads(instance_node[0])
            zk_data[service] = instances_data
        return zk_data
    finally:
        # Bug fix: the connection was previously leaked; always shut it down.
        zk.stop()
        zk.close()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class InstanceTuple(NamedTuple):
    """One service registration, normalized for set comparison.

    paasta_host may be different from the service's host if running on k8s.
    We need the actual PaaSTA host because the k8s pod does not listen for
    xinetd connections.
    """

    paasta_host: str
    host: str
    port: int
    service: str
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def read_from_zk_data(registrations: SmartstackData) -> Set[InstanceTuple]:
    """Convert raw zookeeper registration payloads into InstanceTuples."""
    tuples: Set[InstanceTuple] = set()
    for service, instances in registrations.items():
        for instance_data in instances.values():
            tuples.add(
                InstanceTuple(
                    # "name" is the registering machine's hostname; resolve to
                    # an IP so it compares equal to the nerve-side tuples.
                    host_to_ip(instance_data["name"], instance_data["host"]),
                    instance_data["host"],
                    instance_data["port"],
                    service,
                )
            )
    return tuples
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@functools.lru_cache()
def host_to_ip(host: str, fallback: str) -> str:
    """Resolve *host* to an IP address, falling back to *fallback*.

    Real DNS lookups are slow and awkward to do under asyncio, so hostnames
    that embed their own IP (e.g. "10-1-2-3-..." or "ip-10-1-2-3...") are
    decoded with a regex instead of being resolved.

    The fallback covers nerve registrations whose name is not a resolvable
    hostname (e.g. "prod-db15" or "prod-splunk-master").
    """
    embedded_ip_patterns = (
        r"^(\d+)-(\d+)-(\d+)-(\d+)-",
        r"^ip-(\d+)-(\d+)-(\d+)-(\d+)",
    )
    for pattern in embedded_ip_patterns:
        match = re.match(pattern, host)
        if match:
            return ".".join(match.groups())
    try:
        return socket.gethostbyname(host)
    except socket.gaierror:
        return fallback
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
async def transfer_one_file(
    host: str, port: int = DEFAULT_NERVE_XINETD_PORT
) -> Tuple[str, Optional[str]]:
    """Fetch one host's nerve config over xinetd; (host, None) on failure.

    Failures are not unusual: some hosts are advertised from firewalled
    subnets where this connection can never succeed (check y/ipam to see
    what the subnet means).
    """
    logger.debug(f"getting file from {host}")
    try:
        connect = asyncio.open_connection(host=host, port=port, limit=2**32)
        reader, _ = await asyncio.wait_for(connect, timeout=1.0)
        payload = await asyncio.wait_for(reader.read(), timeout=1.0)
    except (asyncio.TimeoutError, ConnectionRefusedError) as ex:
        logger.debug(f"error getting file from {host}: {ex!r}")
        return (host, None)
    return (host, payload.decode())
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
async def gather_files(hosts: Set[str]) -> Dict[str, str]:
    """Fetch nerve config files from all hosts, CHUNK_SIZE at a time.

    Returns:
        Mapping of host -> file contents (None for hosts transfer_one_file
        could not reach).
    """
    logger.info("gathering files from {} hosts".format(len(hosts)))
    tasks = [transfer_one_file(host) for host in hosts]
    responses: Dict[str, str] = {}
    for idx in range(0, len(tasks), CHUNK_SIZE):
        chunk_results = await asyncio.gather(
            *tasks[idx : idx + CHUNK_SIZE], return_exceptions=True
        )
        for result in chunk_results:
            if isinstance(result, BaseException):
                # Bug fix: dict(resp) used to raise TypeError whenever a task
                # failed with an exception transfer_one_file did not catch
                # (e.g. OSError); log and keep the rest of the chunk.
                logger.warning(f"unexpected error gathering file: {result!r}")
                continue
            host, contents = result
            responses[host] = contents
    return responses
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def read_one_nerve_file(nerve_config: str) -> Set[InstanceTuple]:
    """Parse one host's nerve config JSON into a set of InstanceTuples.

    Only registrations whose zk_path falls under PREFIX are kept.
    """
    parsed = json.loads(nerve_config)
    instances: Set[InstanceTuple] = set()
    for service in parsed["services"].values():
        if not service["zk_path"].startswith(PREFIX):
            continue
        instances.add(
            InstanceTuple(
                # The "instance_id" configured in nerve's config file is the
                # same as the "name" attribute in a zookeeper registration
                # (i.e. for PaaSTA hosts, it will be the hostname of the
                # machine running nerve). To be able to easily compare the
                # tuples using set operations, we resolve it to an IP in both
                # places.
                host_to_ip(parsed["instance_id"], service["host"]),
                service["host"],
                service["port"],
                service["zk_path"][len(PREFIX) :],
            )
        )
    return instances
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def read_nerve_files(
    nerve_configs: Dict[str, Optional[str]]
) -> Tuple[Set[InstanceTuple], Set[str]]:
    """Split fetched nerve configs into parsed instances and missing hosts.

    Returns:
        A pair of (all instances parsed from available configs,
        hosts whose config could not be fetched).
    """
    not_found_hosts = {
        host for host, config in nerve_configs.items() if config is None
    }
    instance_set: Set[InstanceTuple] = set()
    for host_config in nerve_configs.values():
        if host_config is not None:
            instance_set |= read_one_nerve_file(host_config)
    return instance_set, not_found_hosts
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def get_instance_data(
    ignored_services: Set[str],
) -> Tuple[Set[InstanceTuple], Set[InstanceTuple]]:
    """Collect registrations from zookeeper and from hosts' nerve configs.

    Returns:
        (zk registrations restricted to hosts whose nerve config was
        fetchable, registrations parsed from those nerve configs).
    """
    # Dump ZK
    zk_data = get_zk_data(ignored_services)
    zk_instance_data = read_from_zk_data(zk_data)

    hosts = {instance.paasta_host for instance in zk_instance_data}

    # Dump Nerve configs from each host via xinetd.
    # Modernization: asyncio.run replaces the deprecated
    # get_event_loop().run_until_complete pattern.
    results = asyncio.run(gather_files(hosts))

    nerve_instance_data, not_found_hosts = read_nerve_files(results)

    # Filter out anything that we couldn't get a nerve config for, so
    # unreachable hosts don't show up as false-positive orphans.
    zk_instance_data_filtered = {
        inst for inst in zk_instance_data if inst.paasta_host not in not_found_hosts
    }

    logger.info("zk_instance_data (unfiltered) len: {}".format(len(zk_instance_data)))
    logger.info(
        "zk_instance_data (filtered) len: {}".format(len(zk_instance_data_filtered))
    )
    logger.info("nerve_instance_data len: {}".format(len(nerve_instance_data)))

    return zk_instance_data_filtered, nerve_instance_data
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def check_orphans(
    zk_instance_data: Set[InstanceTuple],
    nerve_instance_data: Set[InstanceTuple],
    check_orphans: bool,
    check_collisions: bool,
) -> ExitCode:
    """Compare zk registrations against nerve-side registrations.

    A *collision* is a zk registration whose (host, port) is claimed by a
    different service in some nerve config — traffic is being misrouted.
    An *orphan* is a zk registration with no matching nerve registration.

    Collisions are checked first; if any are found, orphan checking is
    skipped and COLLISIONS is returned.
    """
    if check_collisions:
        # Index nerve registrations by address so each zk entry is one lookup.
        services_by_addr: DefaultDict[Tuple[str, int], Set[str]] = defaultdict(set)
        for nerve_inst in nerve_instance_data:
            services_by_addr[(nerve_inst.host, nerve_inst.port)].add(
                nerve_inst.service
            )

        collisions: List[str] = []
        for zk_inst in zk_instance_data:
            nerve_services = services_by_addr[(zk_inst.host, zk_inst.port)]
            if nerve_services and zk_inst.service not in nerve_services:
                collisions.append(
                    f"[{zk_inst.host}:{zk_inst.port}] {zk_inst.service} collides with {nerve_services}"
                )

        if collisions:
            logger.warning("Collisions found! Traffic is being misrouted!")
            print("\n".join(collisions))
            return ExitCode.COLLISIONS
        logger.info(
            f"No collisions found out of {len(zk_instance_data)} service registrations seen."
        )

    if check_orphans:
        orphans = zk_instance_data - nerve_instance_data

        # Group orphans by host for a compact report.
        orphans_by_host: DefaultDict[str, List[Tuple[int, str]]] = defaultdict(list)
        for orphan in orphans:
            orphans_by_host[orphan.host].append((orphan.port, orphan.service))

        if orphans:
            logger.warning("{} orphans found".format(len(orphans)))
            print(dict(orphans_by_host))
            return ExitCode.ORPHANS
        logger.info(
            f"No orphans found out of {len(zk_instance_data)} service registrations seen."
        )

    return ExitCode.OK
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def main() -> ExitCode:
    """Entry point: parse arguments, gather data, run the requested checks."""
    logging.basicConfig(level=logging.WARNING)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ignored-services",
        # TODO(ckuehl|2020-08-27): Remove this deprecated option alias eventually.
        "--blacklisted-services-DEPRECATED",
        default="",
        type=str,
        help="Comma separated list of services to ignore",
    )
    for flag, flag_help in (
        ("--no-check-collisions", "Skip checking collisions"),
        ("--no-check-orphans", "Skip checking orphans"),
    ):
        parser.add_argument(flag, default=False, action="store_true", help=flag_help)
    args = parser.parse_args()

    if args.no_check_collisions and args.no_check_orphans:
        logger.error("Must check at least one of orphans or collisions.")
        return ExitCode.UNKNOWN

    ignored = set(args.ignored_services.split(","))
    zk_instance_data, nerve_instance_data = get_instance_data(ignored)

    return check_orphans(
        zk_instance_data,
        nerve_instance_data,
        check_orphans=not args.no_check_orphans,
        check_collisions=not args.no_check_collisions,
    )
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
if __name__ == "__main__":
    # Exit with the numeric value of the ExitCode returned by main().
    sys.exit(main().value)
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
import argparse
|
|
3
|
+
import datetime
|
|
4
|
+
import logging
|
|
5
|
+
import smtplib
|
|
6
|
+
import sys
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from email.message import EmailMessage
|
|
9
|
+
from socket import getfqdn
|
|
10
|
+
|
|
11
|
+
import pysensu_yelp
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
from paasta_tools import mesos_tools
|
|
15
|
+
from paasta_tools.monitoring_tools import send_event
|
|
16
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
17
|
+
from paasta_tools.utils import list_services
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
# From-address for outgoing mail — presumably used with smtplib/EmailMessage
# further down this file; confirm against the senders (not visible here).
email_from_address = f"paasta@{getfqdn()}"


# Prefix identifying frameworks launched by Jupyterhub notebooks.
JUPYTER_PREFIX = "jupyterhub_"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_args():
    """Parse command-line options for the long-running Spark framework report."""
    arg_parser = argparse.ArgumentParser(
        description="Reports long-running Spark frameworks."
    )
    arg_parser.add_argument(
        "--min-hours",
        type=float,
        default=0,
        help="Report frameworks that have been registered for more than this duration",
    )
    arg_parser.add_argument(
        "--no-notify",
        action="store_true",
        help="Skip notifying the teams that own each framework",
    )
    arg_parser.add_argument(
        "--email-domain", default=None, help="Email domain for notifying users"
    )
    return arg_parser.parse_args()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_time_running(framework):
    """Return how long *framework* has been registered, as a timedelta.

    Compares the framework's `registered_time` (a Unix timestamp) against
    the current local time.
    """
    started_at = datetime.datetime.fromtimestamp(framework["registered_time"])
    return datetime.datetime.now() - started_at
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def get_spark_properties(framework):
    """Return the ``sparkProperties`` reported by the framework's Spark UI.

    Returns None when the framework exposes no UI URL, the UI is
    unreachable or times out, the HTTP status is not 200, or the response
    body cannot be parsed as expected.
    """
    ui_url = framework.get("webui_url")
    if not ui_url:
        return None

    endpoint = f"{ui_url}/api/v1/applications/{framework.id}/environment"
    try:
        resp = requests.get(endpoint, timeout=5)
    except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as exc:
        logger.warning(f"Unable to connect to {endpoint}: {exc!r}")
        return None

    if resp.status_code != 200:
        logger.warning(f"Bad response from {endpoint}: {resp.status_code}")
        return None

    try:
        return resp.json()["sparkProperties"]
    except (ValueError, KeyError):
        logger.warning(
            f"Unable to get sparkProperties for {framework.id}: got response {resp.text}"
        )
        return None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def guess_service(properties):
    """Extract the owning PaaSTA service from a list of Spark (key, value) pairs.

    Looks for the ``spark.executorEnv.PAASTA_SERVICE`` property; strips the
    Jupyterhub prefix when present so notebook jobs map back to the owning
    service. Returns None when *properties* is falsy or has no such entry.
    """
    if not properties:
        return None
    service = next(
        (val for key, val in properties if key == "spark.executorEnv.PAASTA_SERVICE"),
        None,
    )
    if service is None:
        return None
    if service.startswith(JUPYTER_PREFIX):
        return service[len(JUPYTER_PREFIX) :]
    return service
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_matching_framework_info(min_hours):
    """Collect info dicts for active Spark frameworks running at least *min_hours*.

    Each entry records the framework's id, name, web UI URL, guessed PaaSTA
    service, owning user, and a human-readable running duration.
    """
    cutoff = datetime.timedelta(hours=min_hours)
    matching = []
    for framework in mesos_tools.get_all_frameworks(active_only=True):
        # Defensive re-check even though active_only=True was requested.
        if not framework.active:
            continue
        if framework.get("principal") != "spark":
            continue
        running_for = get_time_running(framework)
        if running_for < cutoff:
            continue
        matching.append(
            {
                "id": framework.id,
                "name": framework.name,
                "webui_url": framework.get("webui_url"),
                "service": guess_service(get_spark_properties(framework)),
                "user": framework.user,
                "time_running": str(running_for),
            }
        )
    return matching
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def format_framework(info):
    """Return a human-readable, multi-line summary of one framework info dict.

    Expects the dict shape produced by get_matching_framework_info
    (keys: name, time_running, user, webui_url).
    """
    result = [f'{info["name"]} (running for {info["time_running"]})']
    result.append(f'  user: {info["user"]}')
    result.append(f'  job UI: {info["webui_url"]}')
    return "\n".join(result)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def format_message_for_service(service, frameworks):
    """Build the notification body listing every long-running framework for *service*."""
    header = (
        f"Found the following long-running Spark frameworks associated with service {service}.\n"
        "Please check why they are still running and terminate if appropriate.\n\n"
    )
    return header + "\n".join(format_framework(fw) for fw in frameworks)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_messages_by_service(frameworks):
    """Group framework info dicts by guessed service and format one message each.

    Returns a mapping of service name (possibly None) to its formatted
    notification text.
    """
    grouped = defaultdict(list)
    for fw in frameworks:
        grouped[fw["service"]].append(fw)
    return {
        svc: format_message_for_service(svc, members)
        for svc, members in grouped.items()
    }
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def update_check_status(service, output, status):
    """Emit a Sensu event for the per-service long_running_spark_jobs check.

    The event is non-paging and opens a ticket; *status* controls whether
    it alerts (WARNING) or resolves (OK).
    """
    send_event(
        service=service,
        check_name=f"long_running_spark_jobs.{service}",
        overrides={
            "page": False,
            "alert_after": 0,
            "tip": "Ask the user to check the job UI and terminate the job if appropriate.",
            "runbook": "http://y/spark-debug",
            "ticket": True,
        },
        status=status,
        output=output,
        soa_dir=DEFAULT_SOA_DIR,
    )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def email_user(framework_info, email_domain):
    """Best-effort email to the user we believe owns a long-running framework.

    The owner is the framework's user unless the job ran as root, in which
    case the username is recovered from a Jupyterhub-style framework name.
    When no owner can be guessed, only a notice is printed. Mail is relayed
    through the local SMTP daemon.
    """
    guessed_user = None
    if framework_info["user"] != "root":
        guessed_user = framework_info["user"]
    elif framework_info["name"].startswith(JUPYTER_PREFIX):
        try:
            # the job format is now `<AppName>_<UserName>_<UIPort>_<StartTime>`
            guessed_user = framework_info["name"].split("_")[-3]
        except IndexError:
            pass

    if not guessed_user:
        print(f"Could not guess user from {framework_info}, skipping user email")
        return
    print(
        f'Guessed {framework_info["name"]} belongs to {guessed_user}, sending email'
    )

    msg = EmailMessage()
    msg["From"] = email_from_address
    msg["To"] = f"{guessed_user}@{email_domain}"
    msg["Subject"] = f'Long-running Spark framework {framework_info["name"]}'
    body = "Please check why it is still running and terminate if appropriate.\n"
    body += format_framework(framework_info)
    msg.set_content(body)
    with smtplib.SMTP("localhost") as smtp:
        smtp.send_message(msg)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def report_spark_jobs(min_hours, no_notify, email_domain=None):
    """Print long-running Spark frameworks and optionally notify their owners.

    Args:
        min_hours: only report frameworks registered for at least this many hours.
        no_notify: when True, only print the report; skip Sensu events and email.
        email_domain: when set (and notifying), also email the guessed owner
            of each framework.

    Returns:
        0 when no matching frameworks were found, 1 otherwise.
    """
    frameworks = get_matching_framework_info(min_hours=min_hours)
    messages_by_service = get_messages_by_service(frameworks)
    valid_services = set(list_services())

    messages_for_unknown_services = []
    for service, message in messages_by_service.items():
        if service in valid_services:
            print(f"{message}\n")
        else:
            messages_for_unknown_services.append(message)
    if messages_for_unknown_services:
        print("\nINVALID SERVICES")
        print("----------------")
        print(
            "The following frameworks are associated with services that are not configured in PaaSTA.\n"
        )
        print("\n\n".join(messages_for_unknown_services))

    if not no_notify:
        for service in valid_services:
            if service in messages_by_service:
                # BUG FIX: previously this passed the stale `message` variable
                # left over from the printing loop above, so every WARNING
                # event carried the last-printed service's message instead of
                # this service's own.
                update_check_status(
                    service, messages_by_service[service], pysensu_yelp.Status.WARNING
                )
            else:
                update_check_status(
                    service, "No long running spark jobs", pysensu_yelp.Status.OK
                )
        if email_domain:
            for framework in frameworks:
                email_user(framework, email_domain)

    return 0 if len(frameworks) == 0 else 1
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def main():
    """Entry point: configure logging, then run the Spark jobs report."""
    options = parse_args()
    logging.basicConfig()
    return report_spark_jobs(
        options.min_hours, options.no_notify, options.email_domain
    )
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
if __name__ == "__main__":
    # report_spark_jobs returns 0/1, which becomes the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#!python
|
|
2
|
+
# Copyright 2015-2018 Yelp Inc.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""
|
|
16
|
+
Usage: ./cleanup_kubernetes_cr.py [options]
|
|
17
|
+
|
|
18
|
+
Command line options:
|
|
19
|
+
|
|
20
|
+
- -d <SOA_DIR>, --soa-dir <SOA_DIR>: Specify a SOA config dir to read from
|
|
21
|
+
- -v, --verbose: Verbose output
|
|
22
|
+
"""
|
|
23
|
+
import argparse
|
|
24
|
+
import logging
|
|
25
|
+
import sys
|
|
26
|
+
from typing import Sequence
|
|
27
|
+
|
|
28
|
+
from paasta_tools.kubernetes_tools import CustomResourceDefinition
|
|
29
|
+
from paasta_tools.kubernetes_tools import delete_custom_resource
|
|
30
|
+
from paasta_tools.kubernetes_tools import KubeClient
|
|
31
|
+
from paasta_tools.kubernetes_tools import list_custom_resources
|
|
32
|
+
from paasta_tools.kubernetes_tools import load_custom_resource_definitions
|
|
33
|
+
from paasta_tools.kubernetes_tools import paasta_prefixed
|
|
34
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
35
|
+
from paasta_tools.utils import load_all_configs
|
|
36
|
+
from paasta_tools.utils import load_system_paasta_config
|
|
37
|
+
|
|
38
|
+
log = logging.getLogger(__name__)  # module-level logger for this script
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the custom-resource cleanup script."""
    opt_parser = argparse.ArgumentParser(description="Cleanup custom_resources.")
    opt_parser.add_argument(
        "-d",
        "--soa-dir",
        dest="soa_dir",
        metavar="SOA_DIR",
        default=DEFAULT_SOA_DIR,
        help="define a different soa config directory",
    )
    opt_parser.add_argument(
        "-v", "--verbose", action="store_true", dest="verbose", default=False
    )
    opt_parser.add_argument(
        "-c", "--cluster", default=None, help="Cluster to cleanup CRs for"
    )
    return opt_parser.parse_args()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def main() -> None:
    """Entry point: clean up CRs whose PaaSTA config no longer exists.

    Exits 0 when every deletion attempt succeeded, 1 otherwise.
    """
    args = parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    kube_client = KubeClient()

    system_paasta_config = load_system_paasta_config()
    # Fall back to this machine's own cluster when none was given.
    cluster = args.cluster or system_paasta_config.get_cluster()
    succeeded = cleanup_all_custom_resources(
        kube_client=kube_client,
        soa_dir=args.soa_dir,
        cluster=cluster,
        custom_resource_definitions=load_custom_resource_definitions(
            system_paasta_config
        ),
    )
    sys.exit(0 if succeeded else 1)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def cleanup_all_custom_resources(
    kube_client: KubeClient,
    soa_dir: str,
    cluster: str,
    custom_resource_definitions: Sequence[CustomResourceDefinition],
) -> bool:
    """Delete custom resources whose service/instance config no longer exists.

    For every PaaSTA-managed CRD present in the cluster, list its custom
    resources and delete those with no matching instance in the SOA configs
    for *cluster*.

    Returns:
        True when every attempted deletion succeeded (or nothing needed
        deleting); False if any deletion raised.
    """
    cluster_crds = {
        crd.spec.names.kind
        for crd in kube_client.apiextensions.list_custom_resource_definition(
            label_selector=paasta_prefixed("service")
        ).items
    }
    log.debug(f"CRDs found: {cluster_crds}")
    results = []
    for crd in custom_resource_definitions:
        if crd.kube_kind.singular not in cluster_crds:
            # TODO: kube_kind.singular seems to correspond to `crd.names.kind`
            # and not `crd.names.singular`
            log.warning(f"CRD {crd.kube_kind.singular} not found in {cluster}")
            continue
        config_dicts = load_all_configs(
            cluster=cluster, file_prefix=crd.file_prefix, soa_dir=soa_dir
        )
        # No configs at all for this resource type: leave everything alone.
        if not config_dicts:
            continue
        crs = list_custom_resources(
            kube_client=kube_client,
            kind=crd.kube_kind,
            version=crd.version,
            group=crd.group,
        )
        for cr in crs:
            service = config_dicts.get(cr.service)
            # Keep the CR when its service still declares this instance.
            if service is not None and service.get(cr.instance) is not None:
                continue
            result = False
            try:
                delete_custom_resource(
                    kube_client=kube_client,
                    name=cr.name,
                    namespace=cr.namespace,
                    plural=crd.kube_kind.plural,
                    version=crd.version,
                    group=crd.group,
                )
                result = True
            except Exception:
                # BUG FIX: the original message lacked the f-prefix, so the
                # literal text "{cr.name}" was logged instead of the CR name.
                log.exception(f"Error while deleting CR {cr.name}")
            results.append(result)
    # Vacuously true when no deletions were attempted.
    return all(results) if results else True
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
if __name__ == "__main__":
    # main() calls sys.exit itself with the cleanup result.
    main()
|