paasta-tools 1.21.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1448 @@
|
|
|
1
|
+
# Copyright 2015-2018 Yelp Inc.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
+
# See the License for the specific language governing permissions and
|
|
12
|
+
# limitations under the License.
|
|
13
|
+
import datetime
|
|
14
|
+
import difflib
|
|
15
|
+
import glob
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import pkgutil
|
|
20
|
+
import re
|
|
21
|
+
import subprocess
|
|
22
|
+
from string import Formatter
|
|
23
|
+
from typing import cast
|
|
24
|
+
from typing import List
|
|
25
|
+
from typing import Mapping
|
|
26
|
+
from typing import Tuple
|
|
27
|
+
from typing import Union
|
|
28
|
+
|
|
29
|
+
from mypy_extensions import TypedDict
|
|
30
|
+
from service_configuration_lib import read_extra_service_information
|
|
31
|
+
from service_configuration_lib import read_yaml_file
|
|
32
|
+
from service_configuration_lib.spark_config import get_total_driver_memory_mb
|
|
33
|
+
from service_configuration_lib.spark_config import SparkConfBuilder
|
|
34
|
+
|
|
35
|
+
from paasta_tools import yaml_tools as yaml
|
|
36
|
+
from paasta_tools.mesos_tools import mesos_services_running_here
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
from yaml.cyaml import CSafeDumper as Dumper
|
|
40
|
+
except ImportError: # pragma: no cover (no libyaml-dev / pypy)
|
|
41
|
+
Dumper = yaml.SafeDumper # type: ignore
|
|
42
|
+
|
|
43
|
+
from paasta_tools.clusterman import get_clusterman_metrics
|
|
44
|
+
from paasta_tools.tron.client import TronClient
|
|
45
|
+
from paasta_tools.tron import tron_command_context
|
|
46
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR, InstanceConfigDict
|
|
47
|
+
from paasta_tools.utils import InstanceConfig
|
|
48
|
+
from paasta_tools.utils import InvalidInstanceConfig
|
|
49
|
+
from paasta_tools.utils import load_system_paasta_config
|
|
50
|
+
from paasta_tools.utils import SystemPaastaConfig
|
|
51
|
+
from paasta_tools.utils import load_v2_deployments_json
|
|
52
|
+
from paasta_tools.utils import NoConfigurationForServiceError
|
|
53
|
+
from paasta_tools.utils import NoDeploymentsAvailable
|
|
54
|
+
from paasta_tools.utils import time_cache
|
|
55
|
+
from paasta_tools.utils import filter_templates_from_config
|
|
56
|
+
from paasta_tools.utils import TronSecretVolume
|
|
57
|
+
from paasta_tools.utils import get_k8s_url_for_cluster
|
|
58
|
+
from paasta_tools.utils import validate_pool
|
|
59
|
+
from paasta_tools.utils import PoolsNotConfiguredError
|
|
60
|
+
from paasta_tools.utils import DockerVolume
|
|
61
|
+
from paasta_tools.utils import ProjectedSAVolume
|
|
62
|
+
|
|
63
|
+
from paasta_tools import spark_tools
|
|
64
|
+
|
|
65
|
+
from paasta_tools.kubernetes_tools import (
|
|
66
|
+
NodeSelectorConfig,
|
|
67
|
+
allowlist_denylist_to_requirements,
|
|
68
|
+
contains_zone_label,
|
|
69
|
+
get_service_account_name,
|
|
70
|
+
limit_size_with_hash,
|
|
71
|
+
raw_selectors_to_requirements,
|
|
72
|
+
to_node_label,
|
|
73
|
+
)
|
|
74
|
+
from paasta_tools.secret_tools import is_secret_ref
|
|
75
|
+
from paasta_tools.secret_tools import is_shared_secret
|
|
76
|
+
from paasta_tools.secret_tools import is_shared_secret_from_secret_name
|
|
77
|
+
from paasta_tools.secret_tools import get_secret_name_from_ref
|
|
78
|
+
from paasta_tools.kubernetes_tools import get_paasta_secret_name
|
|
79
|
+
from paasta_tools.kubernetes_tools import add_volumes_for_authenticating_services
|
|
80
|
+
from paasta_tools.secret_tools import SHARED_SECRET_SERVICE
|
|
81
|
+
|
|
82
|
+
from paasta_tools import monitoring_tools
|
|
83
|
+
from paasta_tools.monitoring_tools import list_teams
|
|
84
|
+
from typing import Optional
|
|
85
|
+
from typing import Dict
|
|
86
|
+
from typing import Any
|
|
87
|
+
|
|
88
|
+
# Module logger; the third-party "tron" logger is noisy at INFO, so quiet it.
log = logging.getLogger(__name__)
logging.getLogger("tron").setLevel(logging.WARNING)

# Name of the implicit Tron namespace that holds master-level settings.
MASTER_NAMESPACE = "MASTER"
# Separator used when composing/decomposing "job.action" instance names.
SPACER = "."
# The set of monitoring keys accepted by the packaged tron_schema.json;
# loaded once at import time from the schema shipped inside paasta_tools.cli.
VALID_MONITORING_KEYS = set(
    json.loads(
        pkgutil.get_data("paasta_tools.cli", "schemas/tron_schema.json").decode()
    )["definitions"]["job"]["properties"]["monitoring"]["properties"].keys()
)
MESOS_EXECUTOR_NAMES = ("paasta",)
KUBERNETES_EXECUTOR_NAMES = ("paasta", "spark")
# Maps the executor name used in soa-configs to the executor type Tron expects.
EXECUTOR_NAME_TO_TRON_EXECUTOR_TYPE = {"paasta": "kubernetes", "spark": "spark"}
KUBERNETES_NAMESPACE = "tron"
DEFAULT_AWS_REGION = "us-west-2"
# Kubernetes namespace each executor type schedules into.
EXECUTOR_TYPE_TO_NAMESPACE = {
    "paasta": "tron",
    "spark": "tron",
}
DEFAULT_TZ = "US/Pacific"
# Clusterman metrics handle; presumably a no-op/None when clusterman is not
# installed — confirm in paasta_tools.clusterman.get_clusterman_metrics.
clusterman_metrics, _ = get_clusterman_metrics()
EXECUTOR_TYPES = ["paasta", "ssh", "spark"]
DEFAULT_SPARK_EXECUTOR_POOL = "batch"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class FieldSelectorConfig(TypedDict):
    """Config entry selecting a field by path (NOTE(review): looks like a
    Kubernetes downward-API ``fieldPath`` — confirm with callers)."""

    field_path: str
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class TronNotConfigured(Exception):
    """Raised when the system Tron config is missing a required setting."""

    pass
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class InvalidTronConfig(Exception):
    """Raised when a Tron configuration is present but invalid."""

    pass
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class InvalidPoolError(Exception):
    """Raised when an action's pool is not allowed on the target cluster."""

    pass
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class TronConfig(dict):
    """Dict-backed system-level configuration for Tron.

    Offers checked accessors that raise ``TronNotConfigured`` when a
    required setting is absent from the underlying mapping.
    """

    def __init__(self, config):
        super().__init__(config)

    def _required(self, key, error_message):
        # Shared lookup helper: translate a missing key into TronNotConfigured.
        if key not in self:
            raise TronNotConfigured(error_message)
        return self[key]

    def get_cluster_name(self):
        """Return the name of the Tron cluster."""
        return self._required(
            "cluster_name",
            "Could not find name of Tron cluster in system Tron config",
        )

    def get_url(self):
        """Return the URL for the Tron master's API."""
        return self._required(
            "url",
            "Could not find URL of Tron master in system Tron config",
        )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def get_tronfig_folder(cluster, soa_dir):
    """Return the tronfig directory for ``cluster`` under ``soa_dir``."""
    tron_root = os.path.join(soa_dir, "tron")
    return os.path.join(tron_root, cluster)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def load_tron_config():
    """Build a :class:`TronConfig` from the system PaaSTA configuration."""
    system_config = load_system_paasta_config()
    return TronConfig(system_config.get_tron_config())
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def get_tron_client():
    """Return a TronClient pointed at the configured Tron master URL."""
    master_url = load_tron_config().get_url()
    return TronClient(master_url)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def compose_instance(job, action):
    """Join a job name and an action name into one instance name."""
    return "{}{}{}".format(job, SPACER, action)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def decompose_instance(instance):
    """Get (job_name, action_name) from an instance.

    :raises InvalidInstanceConfig: if the instance is not exactly
        ``job<SPACER>action``.
    """
    parts = instance.split(SPACER)
    if len(parts) != 2:
        raise InvalidInstanceConfig("Invalid instance name: %s" % instance)
    job_name, action_name = parts
    return (job_name, action_name)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def decompose_executor_id(executor_id) -> Tuple[str, str, int, str]:
    """Split an executor id into (service, job, run_number, action).

    The id is expected to have exactly five SPACER-separated fields; the
    trailing field is discarded and run_number is converted to int.
    """
    service, job, run_number_str, action, _unused = executor_id.split(SPACER)
    return (service, job, int(run_number_str), action)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class StringFormatter(Formatter):
    """Formatter that falls back to a context mapping for named fields.

    Lookup order for a ``{name}`` field: explicit keyword arguments first,
    then the ``context`` mapping supplied at construction time. Positional
    (non-string) fields are delegated to the base ``Formatter``.
    """

    def __init__(self, context=None):
        Formatter.__init__(self)
        # Mapping consulted when a string key is not in the format kwargs.
        self.context = context

    def get_value(self, key, args, kwds):
        if isinstance(key, str):
            try:
                return kwds[key]
            except KeyError:
                return self.context[key]
        else:
            # BUG FIX: this was `Formatter.get_value(key, args, kwds)`, an
            # unbound call that dropped `self` (key was bound as self), so
            # positional fields like "{0}" raised TypeError. Delegate
            # properly to the base class.
            return super().get_value(key, args, kwds)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def parse_time_variables(
    command: str, parse_time: Optional[datetime.datetime] = None
) -> str:
    """Parses an input string and uses the Tron-style dateparsing
    to replace time variables. Currently supports only the date/time
    variables listed in the tron documentation:
    http://tron.readthedocs.io/en/latest/command_context.html#built-in-cc

    :param command: input string to be parsed
    :param parse_time: Reference datetime object used to resolve the date and
        time strings, defaults to now. (Fixed: the old docstring documented a
        nonexistent ``input_string`` parameter, and the annotation used an
        implicit-Optional default.)
    :returns: A string with the date and time variables replaced
    """
    if parse_time is None:
        parse_time = datetime.datetime.now()
    # We build up a tron context object that has the right
    # methods to parse tron-style time syntax
    job_context = tron_command_context.JobRunContext(
        tron_command_context.CommandContext()
    )
    # The tron context object needs the run_time attribute set so it knows
    # how to interpret the date strings
    job_context.job_run.run_time = parse_time
    return StringFormatter(job_context).format(command)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _get_tron_k8s_cluster_override(cluster: str) -> Optional[str]:
    """Look up an alternate compute cluster for a Tron master, if configured.

    Some Tron masters are named differently from the compute cluster that
    should actually run their jobs (e.g. tron-XYZ-test-prod should schedule
    on test-prod, not XYZ-test-prod). An optional system-paasta-config
    mapping, puppeted onto the masters that need it, records these
    master -> compute-cluster overrides.

    :returns: the override cluster name, or None when no mapping exists.
    """
    overrides = load_system_paasta_config().get_tron_k8s_cluster_overrides()
    return overrides.get(cluster)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _spark_k8s_role() -> str:
    """Return the Spark Kubernetes role from system PaaSTA config."""
    system_config = load_system_paasta_config()
    return system_config.get_spark_k8s_role()
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class TronActionConfigDict(InstanceConfigDict, total=False):
    """Typed soa-configs dict for one Tron action (all keys optional)."""

    # this is kinda confusing: long-running stuff is currently using cmd
    # ...but tron are using command - this is going to require a little
    # maneuvering to unify
    command: str
    # Kubernetes service account the action should run as.
    service_account_name: str
    node_selectors: Dict[str, NodeSelectorConfig]

    # the values for this dict can be anything since it's whatever
    # spark accepts
    spark_args: Dict[str, Any]
    # presumably bypasses automatic Spark resource adjustment — confirm
    # where this flag is consumed.
    force_spark_resource_configs: bool
    # TODO: TRON-2145: use this to implement timeout for non-spark actions in tron
    max_runtime: str
    # presumably marks mrjob-based actions — confirm with consumers.
    mrjob: bool
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class TronActionConfig(InstanceConfig):
    """InstanceConfig for a single Tron action (one ``job.action``)."""

    config_dict: TronActionConfigDict
    # Prefix of the soa-configs files this config is read from (tron-*.yaml).
    config_filename_prefix = "tron"
|
|
268
|
+
|
|
269
|
+
    def __init__(
        self,
        service,
        instance,
        cluster,
        config_dict,
        branch_dict,
        soa_dir=DEFAULT_SOA_DIR,
        for_validation=False,
    ):
        """Initialize a Tron action config.

        :param service: service the action belongs to
        :param instance: "job.action" instance name; split into self.job / self.action
        :param cluster: Tron cluster name
        :param config_dict: TronActionConfigDict from soa-configs
        :param branch_dict: deployment branch info
        :param soa_dir: root of soa-configs
        :param for_validation: True when the object exists only to validate configs
        """
        super().__init__(
            cluster=cluster,
            instance=instance,
            service=service,
            config_dict=config_dict,
            branch_dict=branch_dict,
            soa_dir=soa_dir,
        )
        self.job, self.action = decompose_instance(instance)

        # Indicate whether this config object is created for validation
        self.for_validation = for_validation

        # None for non-Spark executors; set before build_spark_config runs so
        # the attribute always exists on the instance.
        self.action_spark_config = None
        if self.get_executor() == "spark":
            # build the complete Spark configuration
            # TODO: add conditional check for Spark specific commands spark-submit, pyspark etc ?
            self.action_spark_config = self.build_spark_config()
|
|
297
|
+
|
|
298
|
+
def get_cpus(self) -> float:
|
|
299
|
+
# set Spark driver pod CPU if it is specified by Spark arguments
|
|
300
|
+
if (
|
|
301
|
+
self.action_spark_config
|
|
302
|
+
and "spark.driver.cores" in self.action_spark_config
|
|
303
|
+
):
|
|
304
|
+
return float(self.action_spark_config["spark.driver.cores"])
|
|
305
|
+
# we fall back to this default if there's no spark.driver.cores config
|
|
306
|
+
return super().get_cpus()
|
|
307
|
+
|
|
308
|
+
def get_mem(self) -> float:
    """Return driver-pod memory (MB) derived from Spark args when present."""
    if not self.action_spark_config:
        # non-Spark action (or no Spark conf): generic instance default
        return super().get_mem()
    return get_total_driver_memory_mb(self.action_spark_config)
|
|
314
|
+
|
|
315
|
+
def get_disk(self, default: float = 1024) -> float:
    """Return disk (MB) for the action.

    Spark drivers get a larger built-in default because 1G is too low for
    them; an explicit `disk` in the action config always takes precedence.

    :param default: fallback disk size in MB when no other source applies
    """
    # increase default threshold for Spark driver pod memory because 1G is too low
    if self.action_spark_config and "disk" not in self.config_dict:
        return spark_tools.SPARK_DRIVER_DEFAULT_DISK_MB
    # Fix: forward `default` to the parent implementation — previously a
    # caller-supplied fallback was accepted but silently ignored.
    return super().get_disk(default)
|
|
321
|
+
|
|
322
|
+
def build_spark_config(self) -> Dict[str, str]:
    """Build the complete Spark configuration dict for this action.

    Resolves the EKS cluster alias, validates the executor pool, stringifies
    user-provided spark_args, delegates to SparkConfBuilder, and then applies
    Tron-specific post-processing (stable app name, pod template override,
    AWS creds provider, driver host, owner label, executor service account).
    """
    system_paasta_config = load_system_paasta_config()
    # aliases map user-facing cluster names to the real EKS cluster;
    # fall back to the configured cluster name when no alias exists
    resolved_cluster = system_paasta_config.get_eks_cluster_aliases().get(
        self.get_cluster(), self.get_cluster()
    )
    pool = self.get_spark_executor_pool()
    try:
        if not validate_pool(resolved_cluster, pool, system_paasta_config):
            raise InvalidPoolError(
                f"Job {self.get_service()}.{self.get_instance()}: "
                f"pool '{pool}' is invalid for cluster '{resolved_cluster}'"
            )
    except PoolsNotConfiguredError:
        # best-effort validation: if the allowed-pools data is missing we
        # warn and proceed rather than blocking the job
        log.warning(
            f"Could not fetch allowed_pools for `{resolved_cluster}`. Skipping pool validation.\n"
        )

    spark_args = self.config_dict.get("spark_args", {})
    # most of the service_configuration_lib function expected string values only
    # so let's go ahead and convert the values now instead of once per-wrapper
    # (booleans become lowercase "true"/"false" as Spark expects)
    stringified_spark_args = {
        k: (str(v) if not isinstance(v, bool) else str(v).lower())
        for k, v in spark_args.items()
    }

    spark_app_name = stringified_spark_args.get(
        "spark.app.name",
        f"tron_spark_{self.get_service()}_{self.get_instance()}",
    )

    spark_conf_builder = SparkConfBuilder(is_driver_on_k8s_tron=True)
    spark_conf = spark_conf_builder.get_spark_conf(
        cluster_manager="kubernetes",
        spark_app_base_name=spark_app_name,
        user_spark_opts=stringified_spark_args,
        paasta_cluster=resolved_cluster,
        paasta_pool=self.get_spark_executor_pool(),
        paasta_service=self.get_service(),
        paasta_instance=self.get_instance(),
        # $PAASTA_DOCKER_IMAGE is substituted at runtime
        docker_img=f"{self.get_docker_registry()}/$PAASTA_DOCKER_IMAGE",
        extra_volumes=cast(
            List[Mapping[str, str]],
            self.get_volumes(
                system_paasta_config.get_volumes(),
            ),
        ),
        use_eks=True,
        k8s_server_address=get_k8s_url_for_cluster(self.get_cluster()),
        force_spark_resource_configs=self.config_dict.get(
            "force_spark_resource_configs", False
        ),
        user=spark_tools.SPARK_TRON_JOB_USER,
    )
    # delete the dynamically generated spark.app.id to prevent frequent config updates in Tron.
    # spark.app.id will be generated later by yelp spark-submit wrapper or Spark itself.
    spark_conf.pop("spark.app.id", None)
    # use a static spark.app.name to prevent frequent config updates in Tron.
    # md5 and base64 will always generate the same encoding for a string.
    # This spark.app.name might be overridden by yelp spark-submit wrapper.
    if "spark.app.name" in spark_conf:
        spark_conf["spark.app.name"] = limit_size_with_hash(
            f"tron_spark_{self.get_service()}_{self.get_instance()}_{self.get_action_name()}"
            if "spark.app.name" not in stringified_spark_args
            else stringified_spark_args["spark.app.name"]
        )

    # TODO(MLCOMPUTE-1220): Remove this once dynamic pod template is generated inside the driver using spark-submit wrapper
    if "spark.kubernetes.executor.podTemplateFile" in spark_conf:
        log.info(
            f"Replacing spark.kubernetes.executor.podTemplateFile="
            f"{spark_conf['spark.kubernetes.executor.podTemplateFile']} with "
            f"spark.kubernetes.executor.podTemplateFile={spark_tools.SPARK_DNS_POD_TEMPLATE}"
        )
        spark_conf[
            "spark.kubernetes.executor.podTemplateFile"
        ] = spark_tools.SPARK_DNS_POD_TEMPLATE

    spark_conf.update(
        {
            "spark.hadoop.fs.s3a.aws.credentials.provider": spark_tools.SPARK_AWS_CREDS_PROVIDER,
            # driver host is filled in from the pod IP at runtime
            "spark.driver.host": "$PAASTA_POD_IP",
        }
    )
    # label executors with the owning team unless the user already set one
    spark_conf.setdefault(
        "spark.kubernetes.executor.label.yelp.com/owner", self.get_team()
    )

    # We are using the Service Account created using the provided or default IAM role.
    spark_conf[
        "spark.kubernetes.authenticate.executor.serviceAccountName"
    ] = get_service_account_name(
        iam_role=self.get_spark_executor_iam_role(),
    )

    return spark_conf
|
|
417
|
+
|
|
418
|
+
def get_cmd(self):
    """Return the action's command line, or None if unset."""
    return self.config_dict.get("command")
|
|
421
|
+
|
|
422
|
+
def get_job_name(self):
    """Tron job half of the composed job.action instance name."""
    return self.job

def get_action_name(self):
    """Action half of the composed job.action instance name."""
    return self.action
|
|
427
|
+
|
|
428
|
+
# mypy does not like the SecretVolume -> TronSecretVolume conversion, because
# TypedDict inheritance is broken; ignore typing here until that is fixed.
def get_secret_volumes(self) -> List[TronSecretVolume]:  # type: ignore
    """Adds the secret_volume_name to the object so tron/task_processing can load it downstream without replicating code."""
    converted = []
    for volume in super().get_secret_volumes():
        entry = TronSecretVolume(
            secret_volume_name=self.get_secret_volume_name(
                volume["secret_name"]
            ),
            secret_name=volume["secret_name"],
            container_path=volume["container_path"],
            items=volume.get("items", []),
        )
        # we have a different place where the default can come from (tron) and
        # we don't want to insert the wrong default here
        if "default_mode" in volume:
            entry["default_mode"] = volume["default_mode"]

        converted.append(entry)
    return converted
|
|
449
|
+
|
|
450
|
+
def get_namespace(self) -> str:
    """Return the configured k8s namespace, falling back to the paasta default."""
    return self.config_dict.get("namespace", KUBERNETES_NAMESPACE)
|
|
453
|
+
|
|
454
|
+
def get_secret_volume_name(self, secret_name: str) -> str:
    """Resolve the k8s secret name for a volume, handling shared secrets."""
    if is_shared_secret_from_secret_name(
        soa_dir=self.soa_dir, secret_name=secret_name
    ):
        owning_service = SHARED_SECRET_SERVICE
    else:
        owning_service = self.service
    return get_paasta_secret_name(
        self.get_namespace(), owning_service, secret_name
    )
|
|
467
|
+
|
|
468
|
+
def get_deploy_group(self) -> Optional[str]:
    """Deploy group for this action, or None when not set."""
    return self.config_dict.get("deploy_group")
|
|
470
|
+
|
|
471
|
+
def get_docker_url(
    self, system_paasta_config: Optional[SystemPaastaConfig] = None
) -> str:
    """Return the docker URL for this action, or "" when nothing is deployed yet.

    It's okay for tronfig to contain things that aren't deployed yet — it's
    normal for developers to push tronfig well before the job is scheduled to
    run; either they'll deploy the service before then or get notified when
    the job fails. Returning "" lets validation and setup_tron_namespace
    succeed even when deployments.json has no entry yet.
    """
    if not self.get_docker_image():
        return ""
    return super().get_docker_url(system_paasta_config=system_paasta_config)
|
|
485
|
+
|
|
486
|
+
def get_env(
    self,
    system_paasta_config: Optional["SystemPaastaConfig"] = None,
) -> Dict[str, str]:
    """Return the environment for the action, with Spark-specific additions."""
    env = super().get_env(system_paasta_config=system_paasta_config)
    if self.get_executor() != "spark":
        return env

    # Required by some sdks like boto3 client. Throws NoRegionError otherwise.
    # AWS_REGION takes precedence if set.
    env["AWS_DEFAULT_REGION"] = DEFAULT_AWS_REGION
    env["PAASTA_INSTANCE_TYPE"] = "spark"
    # XXX: is this actually necessary? every PR that's added this hasn't really
    # mentioned why, and Chesterton's Fence makes me very wary about removing it
    env["SPARK_USER"] = "root"
    # XXX: we were adding the commandline we were starting the Spark driver with
    # to SPARK_OPTS before, but that doesn't really seem necessary from my
    # testing (driver starts just fine) - if this changes and we do need it,
    # please add a comment about *why* we need it!
    # XXX: update PAASTA_RESOURCE_* env vars to use the correct value from
    # spark_args and set these to the correct values for the executors as part
    # of the driver commandline
    return env
|
|
506
|
+
|
|
507
|
+
def get_iam_role(self) -> str:
    """IAM role for the action; Spark actions fall back to the driver default."""
    role = super().get_iam_role()
    if role:
        return role
    if self.get_executor() == "spark":
        return load_system_paasta_config().get_spark_driver_iam_role()
    return role
|
|
514
|
+
|
|
515
|
+
def get_spark_executor_iam_role(self) -> str:
    """IAM role for Spark executors, falling back to the system-wide default."""
    configured = self.get_iam_role()
    if configured:
        return configured
    return load_system_paasta_config().get_spark_executor_iam_role()
|
|
520
|
+
|
|
521
|
+
def get_secret_env(self) -> Mapping[str, dict]:
    """Map env var names whose values are secret refs to k8s secret specs."""
    resolved = {}
    for var_name, raw_value in self.config_dict.get("env", {}).items():
        if not is_secret_ref(raw_value):
            continue
        secret = get_secret_name_from_ref(raw_value)
        # shared secrets live under the shared pseudo-service
        owning_service = (
            SHARED_SECRET_SERVICE if is_shared_secret(raw_value) else self.service
        )
        resolved[var_name] = {
            "secret_name": get_paasta_secret_name(
                self.get_namespace(),
                owning_service,
                secret,
            ),
            "key": secret,
        }
    return resolved
|
|
539
|
+
|
|
540
|
+
def get_field_selector_env(self) -> Dict[str, FieldSelectorConfig]:
    """Return env vars populated from k8s field selectors.

    We're not expecting users to need to add any of these themselves, so for
    now the set of default field-selector env vars is hardcoded here.
    """
    return {"PAASTA_POD_IP": {"field_path": "status.podIP"}}
|
|
548
|
+
|
|
549
|
+
def get_cpu_burst_add(self) -> float:
    """Tron jobs get no CPU burst by default: they are batch workloads, not
    "real-time" ones, and should not impact their neighbors."""
    return self.config_dict.get("cpu_burst_add", 0)
|
|
554
|
+
|
|
555
|
+
def get_executor(self):
    """Executor backend for this action; defaults to "paasta"."""
    return self.config_dict.get("executor", "paasta")

def get_healthcheck_mode(self, _) -> None:
    """Tron actions have no healthchecks."""
    return None
|
|
560
|
+
|
|
561
|
+
def get_node(self):
    """Tron node the action runs on, if configured."""
    return self.config_dict.get("node")

def get_retries(self):
    """Number of retries for the action, if configured."""
    return self.config_dict.get("retries")

def get_retries_delay(self):
    """Delay between retries, if configured."""
    return self.config_dict.get("retries_delay")

def get_requires(self):
    """Upstream actions this one depends on, if configured."""
    return self.config_dict.get("requires")

def get_expected_runtime(self):
    """Expected runtime of the action, if configured."""
    return self.config_dict.get("expected_runtime")
|
|
575
|
+
|
|
576
|
+
def get_triggered_by(self):
    """Upstream triggers this action listens on, if configured."""
    return self.config_dict.get("triggered_by")

def get_trigger_downstreams(self):
    """Downstream triggers emitted by this action, if configured."""
    return self.config_dict.get("trigger_downstreams")

def get_on_upstream_rerun(self):
    """Behavior when an upstream reruns, if configured."""
    return self.config_dict.get("on_upstream_rerun")

def get_trigger_timeout(self):
    """Timeout for waiting on triggers, if configured."""
    return self.config_dict.get("trigger_timeout")
|
|
587
|
+
|
|
588
|
+
def get_node_selectors(self) -> Dict[str, str]:
    """Build the k8s node-selector map for the action's pod."""
    raw_selectors: Dict[str, Any] = self.config_dict.get("node_selectors", {})  # type: ignore
    selectors: Dict[str, str] = {}
    for label, value in raw_selectors.items():
        # only plain string selectors map directly to node labels; richer
        # selector syntax is turned into affinities by get_node_affinities()
        if isinstance(value, str):
            selectors[to_node_label(label)] = value
    # the pool selector is always present
    selectors["yelp.com/pool"] = self.get_pool()
    return selectors
|
|
597
|
+
|
|
598
|
+
def get_node_affinities(self) -> Optional[List[Dict[str, Union[str, List[str]]]]]:
    """Converts deploy_whitelist and deploy_blacklist in node affinities.

    NOTE: At the time of writing, `kubectl describe` does not show affinities,
    only selectors. To see affinities, use `kubectl get pod -o json` instead.

    WARNING: At the time of writing, we only used requiredDuringSchedulingIgnoredDuringExecution node affinities in Tron as we currently have
    no use case for preferredDuringSchedulingIgnoredDuringExecution node affinities.

    Returns None when no requirements apply (so no affinity stanza is emitted).
    """
    requirements = allowlist_denylist_to_requirements(
        allowlist=self.get_deploy_whitelist(),
        denylist=self.get_deploy_blacklist(),
    )
    # non-string node_selectors entries become affinity requirements
    node_selectors = self.config_dict.get("node_selectors", {})
    requirements.extend(
        raw_selectors_to_requirements(
            raw_selectors=node_selectors,
        )
    )

    system_paasta_config = load_system_paasta_config()
    if system_paasta_config.get_enable_tron_tsc():
        # PAASTA-18198: To improve AZ balance with Karpenter, we temporarily allow specifying zone affinities per pool
        pool_node_affinities = system_paasta_config.get_pool_node_affinities()
        if pool_node_affinities and self.get_pool() in pool_node_affinities:
            current_pool_node_affinities = pool_node_affinities[self.get_pool()]
            # If the service already has a node selector for a zone, we don't want to override it
            if current_pool_node_affinities and not contains_zone_label(
                node_selectors
            ):
                requirements.extend(
                    raw_selectors_to_requirements(
                        raw_selectors=current_pool_node_affinities,
                    )
                )

    if not requirements:
        return None

    # (key, op, value) tuples -> the dict shape Tron expects
    return [
        {"key": key, "operator": op, "value": value}
        for key, op, value in requirements
    ]
|
|
641
|
+
|
|
642
|
+
def get_calculated_constraints(self):
    """Combine all configured Mesos constraints."""
    explicit = self.get_constraints()
    if explicit is not None:
        # an explicit constraints list overrides everything else
        return explicit

    combined = self.get_extra_constraints()
    combined.extend(
        self.get_deploy_constraints(
            blacklist=self.get_deploy_blacklist(),
            whitelist=self.get_deploy_whitelist(),
            # Don't have configs for the paasta cluster
            system_deploy_blacklist=[],
            system_deploy_whitelist=None,
        )
    )
    combined.extend(self.get_pool_constraints())
    return combined
|
|
660
|
+
|
|
661
|
+
def get_nerve_namespace(self) -> None:
    """Tron actions have no nerve namespace."""
    return None
|
|
663
|
+
|
|
664
|
+
def validate(self):
    """Validate this action config; returns a list of error messages."""
    error_msgs = list(super().validate())

    # Tron is a little special, because it can *not* have a deploy group —
    # but only if an action is running via ssh and not via paasta.
    if (
        self.get_executor() in MESOS_EXECUTOR_NAMES
        and self.get_deploy_group() is None
    ):
        error_msgs.append(
            f"{self.get_job_name()}.{self.get_action_name()} must have a deploy_group set"
        )

    # We are not allowing users to specify `cpus` and `mem` configuration if
    # the action is a Spark job with the driver running on k8s
    # (executor: spark), because we derive these values from
    # `spark.driver.cores` and `spark.driver.memory` in order to avoid confusion.
    if self.get_executor() == "spark":
        disallowed = (
            ("cpus", "cores", "spark.driver.cores"),
            ("mem", "memory", "spark.driver.memory"),
        )
        for config_key, resource, spark_key in disallowed:
            if config_key in self.config_dict:
                error_msgs.append(
                    f"{self.get_job_name()}.{self.get_action_name()} is a Spark job. `{config_key}` config is not allowed. "
                    f"Please specify the driver {resource} using `{spark_key}`."
                )
    return error_msgs
|
|
691
|
+
|
|
692
|
+
def get_pool(self) -> str:
    """
    Returns the default pool override if pool is not defined in the action configuration.

    This is useful for environments like spam to allow us to default the pool to spam but allow users to
    override this value. To control this, we have an optional config item that we'll puppet onto Tron masters
    which this function will read.
    """
    if self.get_executor() == "spark":
        # Spark drivers always run in the system-configured driver pool
        return load_system_paasta_config().get_default_spark_driver_pool_override()
    return self.config_dict.get(
        "pool", load_system_paasta_config().get_tron_default_pool_override()
    )
|
|
708
|
+
|
|
709
|
+
def get_spark_executor_pool(self) -> str:
    """Pool used by Spark executors (distinct from the driver's pool)."""
    return self.config_dict.get("pool", DEFAULT_SPARK_EXECUTOR_POOL)
|
|
711
|
+
|
|
712
|
+
def get_service_account_name(self) -> Optional[str]:
    """User-specified k8s Service Account name, if any."""
    return self.config_dict.get("service_account_name")
|
|
714
|
+
|
|
715
|
+
def get_projected_sa_volumes(self) -> Optional[List[ProjectedSAVolume]]:
    """Projected service-account volumes, augmented for authenticating services."""
    volumes = add_volumes_for_authenticating_services(
        service_name=self.service,
        config_volumes=super().get_projected_sa_volumes(),
        soa_dir=self.soa_dir,
    )
    # normalize an empty list to None so no empty stanza is emitted
    return volumes or None
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
class TronJobConfig:
    """Represents a job in Tron, consisting of action(s) and job-level configuration values."""

    def __init__(
        self,
        name: str,
        config_dict: Dict[str, Any],
        cluster: str,
        service: Optional[str] = None,
        load_deployments: bool = True,
        soa_dir: str = DEFAULT_SOA_DIR,
        for_validation: bool = False,
    ) -> None:
        """Create a job config.

        :param name: Tron job name
        :param config_dict: raw job dict from tronfig
        :param service: owning service; may also come from config_dict (see get_service)
        :param load_deployments: whether action configs should read deployments.json
        :param for_validation: True when this object exists only to validate tronfig
        """
        self.name = name
        self.config_dict = config_dict
        self.cluster = cluster
        self.service = service
        self.load_deployments = load_deployments
        self.soa_dir = soa_dir
        # Indicate whether this config object is created for validation
        self.for_validation = for_validation
|
|
745
|
+
|
|
746
|
+
def get_name(self):
    """Tron job name."""
    return self.name

def get_node(self):
    """Tron node the job runs on; defaults to "paasta"."""
    return self.config_dict.get("node", "paasta")

def get_schedule(self):
    """Raw schedule value from the job config."""
    return self.config_dict.get("schedule")
|
|
754
|
+
|
|
755
|
+
def get_cron_expression(self) -> Optional[str]:
    """Return the job's cron expression, or None if it isn't cron-scheduled.

    Supports both schedule formats: the dict form
    ``{"type": "cron", "value": "..."}`` and the tagged string form
    ``"cron ..."``.
    """
    schedule = self.config_dict.get("schedule")
    # TODO(TRON-1746): once we simplify this format, we can clean this code up
    if isinstance(schedule, dict) and schedule.get("type") == "cron":
        return schedule["value"]
    elif isinstance(schedule, str) and schedule.startswith("cron"):
        # most cron parsers won't understand our schedule tag, so we need to
        # strip that off before passing it to anything else.
        # Fix: slice off only the leading "cron" tag — str.replace("cron", "")
        # would also remove any other occurrence of "cron" in the expression.
        return schedule[len("cron") :]

    return None
|
|
770
|
+
|
|
771
|
+
def get_monitoring(self):
    """Service-level monitoring config overlaid with the job's own, filtered
    down to the keys Tron understands."""
    merged = dict(
        monitoring_tools.read_monitoring_config(self.service, soa_dir=self.soa_dir)
    )
    merged.update(self.config_dict.get("monitoring", {}))
    # filter out non-tron monitoring keys
    return {k: v for k, v in merged.items() if k in VALID_MONITORING_KEYS}
|
|
782
|
+
|
|
783
|
+
def get_queueing(self):
    """Whether runs queue behind one another, if configured."""
    return self.config_dict.get("queueing")

def get_run_limit(self):
    """Maximum retained runs, if configured."""
    return self.config_dict.get("run_limit")

def get_all_nodes(self):
    """Whether the job runs on all nodes, if configured."""
    return self.config_dict.get("all_nodes")

def get_enabled(self):
    """Whether the job is enabled, if configured."""
    return self.config_dict.get("enabled")

def get_allow_overlap(self):
    """Whether overlapping runs are allowed, if configured."""
    return self.config_dict.get("allow_overlap")

def get_max_runtime(self):
    """Maximum runtime before Tron intervenes, if configured."""
    return self.config_dict.get("max_runtime")

def get_time_zone(self):
    """Time zone for schedule evaluation, if configured."""
    return self.config_dict.get("time_zone")
|
|
803
|
+
|
|
804
|
+
def get_service(self) -> Optional[str]:
    """Owning service; constructor argument wins over the config value."""
    return self.service or self.config_dict.get("service")

def get_deploy_group(self) -> Optional[str]:
    """Job-level deploy group, if configured."""
    return self.config_dict.get("deploy_group")

def get_cluster(self):
    """PaaSTA cluster this job belongs to."""
    return self.cluster

def get_expected_runtime(self):
    """Expected runtime of the job, if configured."""
    return self.config_dict.get("expected_runtime")
|
|
815
|
+
|
|
816
|
+
def _get_action_config(self, action_name, action_dict) -> TronActionConfig:
    """Build a TronActionConfig for one action of this job.

    NOTE: intentionally mutates action_dict via setdefault so job-level
    service/deploy_group values flow down to the action, and injects the
    merged monitoring config.
    """
    action_service = action_dict.setdefault("service", self.get_service())
    action_deploy_group = action_dict.setdefault(
        "deploy_group", self.get_deploy_group()
    )
    if action_service and action_deploy_group and self.load_deployments:
        try:
            deployments_json = load_v2_deployments_json(
                service=action_service, soa_dir=self.soa_dir
            )
            branch_dict = {
                "docker_image": deployments_json.get_docker_image_for_deploy_group(
                    action_deploy_group
                ),
                "git_sha": deployments_json.get_git_sha_for_deploy_group(
                    action_deploy_group
                ),
                "image_version": deployments_json.get_image_version_for_deploy_group(
                    action_deploy_group
                ),
                # TODO: add Tron instances when generating deployments json
                "desired_state": "start",
                "force_bounce": None,
            }
        except NoDeploymentsAvailable:
            # tronfig is routinely pushed before a service's first deploy, so
            # a missing deployments.json is a warning, not an error
            log.warning(
                f'Docker image unavailable for {action_service}.{self.get_name()}.{action_dict.get("name")}'
                " is it deployed yet?"
            )

            if self.soa_dir != DEFAULT_SOA_DIR:
                log.warning(
                    f"Error: No deployments.json found in {self.soa_dir}/{action_service}. "
                    "You can generate this by running: "
                    f"generate_deployments_for_service -d {self.soa_dir} -s {action_service}"
                )

            branch_dict = None
    else:
        branch_dict = None
    action_dict["monitoring"] = self.get_monitoring()

    # some clusters map to a different k8s cluster for Tron workloads
    cluster_override = _get_tron_k8s_cluster_override(self.get_cluster())
    return TronActionConfig(
        service=action_service,
        instance=compose_instance(self.get_name(), action_name),
        cluster=cluster_override or self.get_cluster(),
        config_dict=action_dict,
        branch_dict=branch_dict,
        soa_dir=self.soa_dir,
        for_validation=self.for_validation,
    )
|
|
868
|
+
|
|
869
|
+
def get_actions(self) -> List[TronActionConfig]:
    """Build a TronActionConfig for every action of this job."""
    configured_actions = self.config_dict.get("actions")
    action_configs = []
    for action_name, action_dict in configured_actions.items():
        action_configs.append(self._get_action_config(action_name, action_dict))
    return action_configs
|
|
875
|
+
|
|
876
|
+
def get_cleanup_action(self):
    """Return the cleanup action's config, or None when not configured."""
    cleanup_dict = self.config_dict.get("cleanup_action")
    if cleanup_dict:
        # TODO: we should keep this trickery outside paasta repo
        return self._get_action_config("cleanup", cleanup_dict)
    return None
|
|
883
|
+
|
|
884
|
+
def check_monitoring(self) -> Tuple[bool, str]:
    """Validate the job's monitoring team; returns (ok, error_message)."""
    monitoring = self.get_monitoring()
    valid_teams = list_teams()
    if monitoring is None:
        return True, ""
    team_name = monitoring.get("team", None)
    if team_name is None:
        return False, "Team name is required for monitoring"
    if team_name in valid_teams:
        return True, ""
    # unknown team: offer close matches to help catch typos
    suggest_teams = difflib.get_close_matches(
        word=team_name, possibilities=valid_teams
    )
    return (
        False,
        f"Invalid team name: {team_name}. Do you mean one of these: {suggest_teams}",
    )
|
|
900
|
+
|
|
901
|
+
def check_actions(self) -> Tuple[bool, List[str]]:
    """Validate every action (including cleanup); returns (ok, messages)."""
    all_actions = self.get_actions()
    cleanup_action = self.get_cleanup_action()
    if cleanup_action:
        all_actions.append(cleanup_action)

    msgs: List[str] = []
    for action in all_actions:
        msgs.extend(action.validate())
    # the job passes iff no action produced an error message
    return not msgs, msgs
|
|
915
|
+
|
|
916
|
+
def validate(self) -> List[str]:
    """Run all job-level checks; returns the combined error messages."""
    _, error_msgs = self.check_actions()
    monitoring_ok, monitoring_msg = self.check_monitoring()
    if not monitoring_ok:
        error_msgs.append(monitoring_msg)
    return error_msgs
|
|
924
|
+
|
|
925
|
+
def __eq__(self, other):
    """Jobs are equal iff they are the same type with equal config dicts."""
    return isinstance(other, type(self)) and self.config_dict == other.config_dict
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
def format_volumes(paasta_volume_list):
    """Convert paasta camelCase volume dicts to Tron's snake_case shape."""
    formatted = []
    for volume in paasta_volume_list:
        formatted.append(
            {
                "container_path": volume["containerPath"],
                "host_path": volume["hostPath"],
                "mode": volume["mode"],
            }
        )
    return formatted


def format_master_config(master_config, default_volumes, dockercfg_location):
    """Inject default volumes and dockercfg location into a Tron master config.

    Mutates and returns master_config: mesos_options always gets the defaults;
    k8s_options only gets default_volumes when it is already present.
    """
    mesos_options = master_config.get("mesos_options", {})
    mesos_options["default_volumes"] = format_volumes(default_volumes)
    mesos_options["dockercfg_location"] = dockercfg_location
    master_config["mesos_options"] = mesos_options

    k8s_options = master_config.get("k8s_options", {})
    if k8s_options:
        # Only add default volumes if we already have k8s_options
        k8s_options["default_volumes"] = format_volumes(default_volumes)
        master_config["k8s_options"] = k8s_options
    return master_config
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def format_tron_action_dict(action_config: TronActionConfig):
    """Generate a dict of tronfig for an action, from the TronActionConfig.

    Builds executor-independent fields first, then layers on k8s- or
    Mesos-specific configuration; None-valued entries are stripped at the
    end so Tron applies its own defaults for them.

    :param action_config: TronActionConfig
    """
    executor = action_config.get_executor()
    # fields common to every executor type
    result = {
        "command": action_config.get_cmd(),
        "executor": executor,
        "requires": action_config.get_requires(),
        "node": action_config.get_node(),
        "retries": action_config.get_retries(),
        "retries_delay": action_config.get_retries_delay(),
        "secret_volumes": action_config.get_secret_volumes(),
        "expected_runtime": action_config.get_expected_runtime(),
        "trigger_downstreams": action_config.get_trigger_downstreams(),
        "triggered_by": action_config.get_triggered_by(),
        "on_upstream_rerun": action_config.get_on_upstream_rerun(),
        "trigger_timeout": action_config.get_trigger_timeout(),
        # outside of Spark use-cases, we also allow users to specify an expected-to-exist Service Account name
        # in the Tron namespace in case an action needs specific k8s permissions (e.g., a Jolt batch may need
        # k8s permissions to list Jolt pods in the jolt namespace to do science™ to them).
        # if the provided Service Account does not exist, Tron should simply fail to create the Podspec and report
        # a failure
        # NOTE: this will get overridden if an action specifies Pod Identity configs
        "service_account_name": action_config.get_service_account_name(),
    }

    # we need this loaded in several branches, so we'll load it once at the start to simplify things
    system_paasta_config = load_system_paasta_config()

    if executor in KUBERNETES_EXECUTOR_NAMES:
        # we'd like Tron to be able to distinguish between spark and normal actions
        # even though they both run on k8s
        result["executor"] = EXECUTOR_NAME_TO_TRON_EXECUTOR_TYPE.get(
            executor, "kubernetes"
        )

        result["secret_env"] = action_config.get_secret_env()
        result["field_selector_env"] = action_config.get_field_selector_env()
        all_env = action_config.get_env()
        # For k8s, we do not want secret envvars to be duplicated in both `env` and `secret_env`
        # or for field selector env vars to be overwritten
        result["env"] = {
            k: v
            for k, v in all_env.items()
            if not is_secret_ref(v) and k not in result["field_selector_env"]
        }
        result["env"]["ENABLE_PER_INSTANCE_LOGSPOUT"] = "1"
        result["node_selectors"] = action_config.get_node_selectors()
        result["node_affinities"] = action_config.get_node_affinities()

        if system_paasta_config.get_enable_tron_tsc():
            # XXX: this is currently hardcoded since we should only really need TSC for zone-aware scheduling
            result["topology_spread_constraints"] = [
                {
                    # try to evenly spread pods across specified topology
                    "max_skew": 1,
                    # narrow down what pods to consider when spreading
                    "label_selector": {
                        # only consider pods that are managed by tron
                        "app.kubernetes.io/managed-by": "tron",
                        # and in the same pool
                        "paasta.yelp.com/pool": action_config.get_pool(),
                    },
                    # now, spread across AZs
                    "topology_key": "topology.kubernetes.io/zone",
                    # but if not possible, schedule even with a zonal imbalance
                    "when_unsatisfiable": "ScheduleAnyway",
                },
            ]

        # XXX: once we're off mesos we can make get_cap_* return just the cap names as a list
        result["cap_add"] = [cap["value"] for cap in action_config.get_cap_add()]
        result["cap_drop"] = [cap["value"] for cap in action_config.get_cap_drop()]

        # labels are limited to 63 chars, hence the hashing of long instance names
        result["labels"] = {
            "paasta.yelp.com/cluster": action_config.get_cluster(),
            "paasta.yelp.com/pool": action_config.get_pool(),
            "paasta.yelp.com/service": action_config.get_service(),
            "paasta.yelp.com/instance": limit_size_with_hash(
                action_config.get_instance(),
                limit=63,
                suffix=4,
            ),
            # XXX: should this be different for Spark drivers launched by Tron?
            "app.kubernetes.io/managed-by": "tron",
        }

        result["annotations"] = {
            # we can hardcode this for now as batches really shouldn't
            # need routable IPs and we know that Spark does.
            "paasta.yelp.com/routable_ip": "true" if executor == "spark" else "false",
            # we have a large amount of tron pods whose instance names are too long for a k8s label
            # ...so let's toss them into an annotation so that tooling can read them (since the length
            # limit is much higher (256kb))
            "paasta.yelp.com/service": action_config.get_service(),
            "paasta.yelp.com/instance": action_config.get_instance(),
        }

        result["labels"]["yelp.com/owner"] = "compute_infra_platform_experience"

        if (
            action_config.get_iam_role_provider() == "aws"
            and action_config.get_iam_role()
        ):
            # this service account will be used for normal Tron batches as well as for Spark drivers
            result["service_account_name"] = get_service_account_name(
                iam_role=action_config.get_iam_role(),
                k8s_role=None,
            )

        # service account token volumes for service authentication
        result["projected_sa_volumes"] = action_config.get_projected_sa_volumes()

        # XXX: now that we're actually passing through extra_volumes correctly (e.g., using get_volumes()),
        # we can get rid of the default_volumes from the Tron master config
        # NOTE(review): extra_volumes is only bound inside this k8s branch but is read in the
        # shared k8s/Mesos block below; a purely-Mesos action would appear to raise NameError
        # at format_volumes(extra_volumes) — confirm the Mesos path is effectively dead.
        extra_volumes = action_config.get_volumes(
            system_paasta_config.get_volumes(),
        )
        if executor == "spark":
            is_mrjob = action_config.config_dict.get("mrjob", False)
            # inject additional Spark configs in case of Spark commands
            result["command"] = spark_tools.build_spark_command(
                result["command"],
                action_config.action_spark_config,
                is_mrjob,
                action_config.config_dict.get(
                    "max_runtime", spark_tools.DEFAULT_SPARK_RUNTIME_TIMEOUT
                ),
                silent=True,
            )
            # point to the KUBECONFIG needed by Spark driver
            result["env"]["KUBECONFIG"] = system_paasta_config.get_spark_kubeconfig()

            # spark, unlike normal batches, needs to expose several ports for things like the spark
            # ui and for executor->driver communication
            result["ports"] = list(
                set(
                    spark_tools.get_spark_ports_from_config(
                        action_config.action_spark_config
                    )
                )
            )
            # mount KUBECONFIG file for Spark drivers to communicate with EKS cluster
            extra_volumes.append(
                DockerVolume(
                    {
                        "containerPath": system_paasta_config.get_spark_kubeconfig(),
                        "hostPath": system_paasta_config.get_spark_kubeconfig(),
                        "mode": "RO",
                    }
                )
            )
            # Add pod annotations and labels for Spark monitoring metrics
            monitoring_annotations = (
                spark_tools.get_spark_driver_monitoring_annotations(
                    action_config.action_spark_config
                )
            )
            monitoring_labels = spark_tools.get_spark_driver_monitoring_labels(
                action_config.action_spark_config,
                user=spark_tools.SPARK_TRON_JOB_USER,
            )
            result["annotations"].update(monitoring_annotations)
            result["labels"].update(monitoring_labels)

    elif executor in MESOS_EXECUTOR_NAMES:
        result["executor"] = "mesos"
        constraint_labels = ["attribute", "operator", "value"]
        result["constraints"] = [
            dict(zip(constraint_labels, constraint))
            for constraint in action_config.get_calculated_constraints()
        ]
        result["docker_parameters"] = [
            {"key": param["key"], "value": param["value"]}
            for param in action_config.format_docker_parameters()
        ]
        result["env"] = action_config.get_env()

    # the following config is only valid for k8s/Mesos since we're not running SSH actions
    # in a containerized fashion
    if executor in (KUBERNETES_EXECUTOR_NAMES + MESOS_EXECUTOR_NAMES):
        result["cpus"] = action_config.get_cpus()
        result["mem"] = action_config.get_mem()
        result["disk"] = action_config.get_disk()
        result["extra_volumes"] = format_volumes(extra_volumes)
        result["docker_image"] = action_config.get_docker_url()

    # Only pass non-None values, so Tron will use defaults for others
    return {key: val for key, val in result.items() if val is not None}
|
|
1155
|
+
|
|
1156
|
+
|
|
1157
|
+
def format_tron_job_dict(job_config: TronJobConfig, k8s_enabled: bool = False):
    """Generate a dict of tronfig for a job, from the TronJobConfig.

    :param job_config: TronJobConfig
    :param k8s_enabled: accepted for caller compatibility; not read by this body
    """
    actions = {}
    for action_config in job_config.get_actions():
        actions[action_config.get_action_name()] = format_tron_action_dict(
            action_config=action_config,
        )

    job_dict = {
        "node": job_config.get_node(),
        "schedule": job_config.get_schedule(),
        "actions": actions,
        "monitoring": job_config.get_monitoring(),
        "queueing": job_config.get_queueing(),
        "run_limit": job_config.get_run_limit(),
        "all_nodes": job_config.get_all_nodes(),
        "enabled": job_config.get_enabled(),
        "allow_overlap": job_config.get_allow_overlap(),
        "max_runtime": job_config.get_max_runtime(),
        "time_zone": job_config.get_time_zone(),
        "expected_runtime": job_config.get_expected_runtime(),
    }

    cleanup_config = job_config.get_cleanup_action()
    if cleanup_config:
        job_dict["cleanup_action"] = format_tron_action_dict(
            action_config=cleanup_config,
        )

    # Only pass non-None values, so Tron will use defaults for others
    return {key: val for key, val in job_dict.items() if val is not None}
|
|
1193
|
+
|
|
1194
|
+
|
|
1195
|
+
def load_tron_instance_config(
    service: str,
    instance: str,
    cluster: str,
    load_deployments: bool = True,
    soa_dir: str = DEFAULT_SOA_DIR,
) -> TronActionConfig:
    """Return the TronActionConfig for one instance of a service.

    :raises NoConfigurationForServiceError: when no action matches ``instance``
    """
    all_actions = load_tron_instance_configs(
        service=service,
        cluster=cluster,
        load_deployments=load_deployments,
        soa_dir=soa_dir,
    )
    matching = next(
        (action for action in all_actions if action.get_instance() == instance),
        None,
    )
    if matching is None:
        raise NoConfigurationForServiceError(
            f"No tron configuration found for {service} {instance}"
        )
    return matching
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
@time_cache(ttl=5)
def load_tron_instance_configs(
    service: str,
    cluster: str,
    load_deployments: bool = True,
    soa_dir: str = DEFAULT_SOA_DIR,
) -> Tuple[TronActionConfig, ...]:
    """Return every TronActionConfig for a service in a cluster (5s TTL cache)."""
    jobs = load_tron_service_config(
        service=service,
        cluster=cluster,
        load_deployments=load_deployments,
        soa_dir=soa_dir,
    )
    # flatten each job's actions into one immutable sequence
    return tuple(action for job in jobs for action in job.get_actions())
|
|
1235
|
+
|
|
1236
|
+
|
|
1237
|
+
@time_cache(ttl=5)
def load_tron_service_config(
    service,
    cluster,
    load_deployments=True,
    soa_dir=DEFAULT_SOA_DIR,
    for_validation=False,
):
    """Cached (5s TTL) wrapper around load_tron_service_config_no_cache."""
    return load_tron_service_config_no_cache(
        service,
        cluster,
        load_deployments=load_deployments,
        soa_dir=soa_dir,
        for_validation=for_validation,
    )
|
|
1252
|
+
|
|
1253
|
+
|
|
1254
|
+
def load_tron_service_config_no_cache(
    service,
    cluster,
    load_deployments=True,
    soa_dir=DEFAULT_SOA_DIR,
    for_validation=False,
):
    """Load all configured jobs for a service, and any additional config values."""
    raw_config = read_extra_service_information(
        service_name=service, extra_info=f"tron-{cluster}", soa_dir=soa_dir
    )
    # drop template entries; what remains maps job name -> job dict
    job_dicts = filter_templates_from_config(raw_config)

    job_configs = []
    for job_name, job_dict in job_dicts.items():
        job_configs.append(
            TronJobConfig(
                name=job_name,
                service=service,
                cluster=cluster,
                config_dict=job_dict,
                load_deployments=load_deployments,
                soa_dir=soa_dir,
                for_validation=for_validation,
            )
        )
    return job_configs
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
def create_complete_master_config(cluster, soa_dir=DEFAULT_SOA_DIR):
    """Render the Tron MASTER namespace config for a cluster as a YAML string.

    Reads MASTER.yaml from the cluster's tronfig folder and injects the
    system-wide default volumes and dockercfg location before dumping.

    :param cluster: Tron cluster name
    :param soa_dir: root of the soa-configs checkout
    """
    system_paasta_config = load_system_paasta_config()
    tronfig_folder = get_tronfig_folder(soa_dir=soa_dir, cluster=cluster)
    # plain literal: the original used an f-string with no placeholders (F541)
    config = read_yaml_file(os.path.join(tronfig_folder, "MASTER.yaml"))
    master_config = format_master_config(
        config,
        system_paasta_config.get_volumes(),
        system_paasta_config.get_dockercfg_location(),
    )
    return yaml.dump(master_config, Dumper=Dumper, default_flow_style=False)
|
|
1291
|
+
|
|
1292
|
+
|
|
1293
|
+
def create_complete_config(
    service: str,
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    k8s_enabled: bool = False,
    dry_run: bool = False,
):
    """Generate a namespace configuration file for Tron, for a service."""
    job_configs = load_tron_service_config(
        service=service,
        cluster=cluster,
        load_deployments=True,
        soa_dir=soa_dir,
        for_validation=dry_run,
    )
    # map each job name to its formatted tronfig dict
    jobs = {}
    for job_config in job_configs:
        jobs[job_config.get_name()] = format_tron_job_dict(
            job_config=job_config, k8s_enabled=k8s_enabled
        )
    return yaml.dump({"jobs": jobs}, Dumper=Dumper, default_flow_style=False)
|
|
1316
|
+
|
|
1317
|
+
|
|
1318
|
+
def validate_complete_config(
    service: str, cluster: str, soa_dir: str = DEFAULT_SOA_DIR
) -> List[str]:
    """Validate a service's tronfig for a cluster; return error messages, [] if valid.

    Runs PaaSTA's own per-job validation first, then shells out to the
    ``tronfig`` CLI to validate the fully rendered config.
    """
    job_configs = load_tron_service_config(
        service=service,
        cluster=cluster,
        load_deployments=False,
        soa_dir=soa_dir,
        for_validation=True,
    )

    # PaaSTA-specific validation; short-circuits on the first failing job
    for job_config in job_configs:
        check_msgs = job_config.validate()
        if check_msgs:
            return check_msgs

    master_config_path = os.path.join(
        os.path.abspath(soa_dir), "tron", cluster, MASTER_NAMESPACE + ".yaml"
    )

    # TODO: remove creating the master config here once we're fully off of mesos
    # since we only have it here to verify that the generated tronfig will be valid
    # given that the kill-switch will affect PaaSTA's setup_tron_namespace script (we're
    # not reading the kill-switch in Tron since it's not easily accessible at the point
    # at which we'd like to fallback to Mesos if toggled)
    master_config = yaml.safe_load(
        create_complete_master_config(cluster=cluster, soa_dir=soa_dir)
    )
    k8s_enabled_for_cluster = master_config.get("k8s_options", {}).get("enabled", False)

    preproccessed_config = {}
    # Use Tronfig on generated config from PaaSTA to validate the rest
    preproccessed_config["jobs"] = {
        job_config.get_name(): format_tron_job_dict(
            job_config=job_config, k8s_enabled=k8s_enabled_for_cluster
        )
        for job_config in job_configs
    }

    complete_config = yaml.dump(preproccessed_config, Dumper=Dumper)

    # tronfig reads the rendered YAML on stdin ("-"); -V validates only
    proc = subprocess.run(
        ["tronfig", "-", "-V", "-n", service, "-m", master_config_path],
        input=complete_config,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8",
    )

    if proc.returncode != 0:
        process_errors = proc.stderr.strip()
        if process_errors:  # Error running tronfig
            print(proc.stderr)
        # validation failures are reported on stdout; note this is returned even
        # when stdout is empty (yielding [""]) so callers always see a failure
        return [proc.stdout.strip()]

    return []
|
|
1375
|
+
|
|
1376
|
+
|
|
1377
|
+
def _is_valid_namespace(job: Any, tron_executors: List[str]) -> bool:
    """Return True if any of the job's actions runs on one of the given executors.

    Actions without an explicit executor default to "paasta".
    """
    return any(
        action_info.get("executor", "paasta") in tron_executors
        for action_info in job.get("actions", {}).values()
    )
|
|
1382
|
+
|
|
1383
|
+
|
|
1384
|
+
def get_tron_namespaces(
    cluster: str,
    soa_dir: str,
    tron_executors: List[str] = EXECUTOR_TYPES,
) -> List[str]:
    """List service namespaces with at least one tron-runnable job for a cluster.

    Walks soa_dir for directories containing a tron-<cluster>.yaml and keeps
    those whose config has any job with a matching executor.
    """
    tron_config_file = f"tron-{cluster}.yaml"
    candidate_namespaces = [
        os.path.split(dirpath)[1]
        for dirpath, _subdirs, filenames in os.walk(os.path.abspath(soa_dir))
        if tron_config_file in filenames
    ]

    tron_namespaces = set()
    for namespace in candidate_namespaces:
        config = filter_templates_from_config(
            read_extra_service_information(
                namespace,
                extra_info=f"tron-{cluster}",
                soa_dir=soa_dir,
                deepcopy=False,
            )
        )
        if any(_is_valid_namespace(job, tron_executors) for job in config.values()):
            tron_namespaces.add(namespace)
    return list(tron_namespaces)
|
|
1411
|
+
|
|
1412
|
+
|
|
1413
|
+
def list_tron_clusters(service: str, soa_dir: str = DEFAULT_SOA_DIR) -> List[str]:
    """Returns the Tron clusters a service is configured to deploy to."""
    # cluster name is embedded in the filename: tron-<cluster>.yaml
    cluster_pattern = re.compile(r"/tron-([0-9a-z-_]*)\.yaml$")
    service_dir = os.path.join(soa_dir, service)
    clusters = []
    for filename in glob.glob(f"{service_dir}/*.yaml"):
        match = cluster_pattern.search(filename)
        if match is not None:
            clusters.append(match.group(1))
    return clusters
|
|
1423
|
+
|
|
1424
|
+
|
|
1425
|
+
def get_tron_dashboard_for_cluster(cluster: str):
    """Return the configured Tron dashboard link for a cluster, raising if absent."""
    cluster_dashboards = load_system_paasta_config().get_dashboard_links()[cluster]
    if "Tron" not in cluster_dashboards:
        raise Exception(f"tron api endpoint is not defined for cluster {cluster}")
    return cluster_dashboards["Tron"]
|
|
1430
|
+
|
|
1431
|
+
|
|
1432
|
+
def tron_jobs_running_here() -> List[Tuple[str, str, int]]:
    """Return tron tasks running on this host, via mesos_services_running_here."""

    def _is_tron_framework(framework) -> bool:
        # tron frameworks are identified purely by name prefix
        return framework["name"].startswith("tron")

    return mesos_services_running_here(
        framework_filter=_is_tron_framework,
        parse_service_instance_from_executor_id=parse_service_instance_from_executor_id,
    )
|
|
1437
|
+
|
|
1438
|
+
|
|
1439
|
+
def parse_service_instance_from_executor_id(task_id: str) -> Tuple[str, str]:
    """Parses tron mesos task ids, like schematizer.traffic_generator.28414.turnstyle.46da87d7-6092-4ed4-b926-ffa7b21c7785"""
    try:
        # expected shape: <service>.<job>.<job_run>.<action>.<uuid>
        service, job, _job_run, action, _uuid = task_id.split(".")
    except Exception as e:
        log.warning(
            f"Couldn't parse the mesos task id into a valid tron job: {task_id}: {e}"
        )
        service, job, action = "unknown_service", "unknown_job", "unknown_action"
    return service, f"{job}.{action}"
|