paasta_tools-1.21.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_itests/__init__.py +0 -0
- k8s_itests/test_autoscaling.py +23 -0
- k8s_itests/utils.py +38 -0
- paasta_tools/__init__.py +20 -0
- paasta_tools/adhoc_tools.py +142 -0
- paasta_tools/api/__init__.py +13 -0
- paasta_tools/api/api.py +330 -0
- paasta_tools/api/api_docs/swagger.json +2323 -0
- paasta_tools/api/client.py +106 -0
- paasta_tools/api/settings.py +33 -0
- paasta_tools/api/tweens/__init__.py +6 -0
- paasta_tools/api/tweens/auth.py +125 -0
- paasta_tools/api/tweens/profiling.py +108 -0
- paasta_tools/api/tweens/request_logger.py +124 -0
- paasta_tools/api/views/__init__.py +13 -0
- paasta_tools/api/views/autoscaler.py +100 -0
- paasta_tools/api/views/exception.py +45 -0
- paasta_tools/api/views/flink.py +73 -0
- paasta_tools/api/views/instance.py +395 -0
- paasta_tools/api/views/pause_autoscaler.py +71 -0
- paasta_tools/api/views/remote_run.py +113 -0
- paasta_tools/api/views/resources.py +76 -0
- paasta_tools/api/views/service.py +35 -0
- paasta_tools/api/views/version.py +25 -0
- paasta_tools/apply_external_resources.py +79 -0
- paasta_tools/async_utils.py +109 -0
- paasta_tools/autoscaling/__init__.py +0 -0
- paasta_tools/autoscaling/autoscaling_service_lib.py +57 -0
- paasta_tools/autoscaling/forecasting.py +106 -0
- paasta_tools/autoscaling/max_all_k8s_services.py +41 -0
- paasta_tools/autoscaling/pause_service_autoscaler.py +77 -0
- paasta_tools/autoscaling/utils.py +52 -0
- paasta_tools/bounce_lib.py +184 -0
- paasta_tools/broadcast_log_to_services.py +62 -0
- paasta_tools/cassandracluster_tools.py +210 -0
- paasta_tools/check_autoscaler_max_instances.py +212 -0
- paasta_tools/check_cassandracluster_services_replication.py +35 -0
- paasta_tools/check_flink_services_health.py +203 -0
- paasta_tools/check_kubernetes_api.py +57 -0
- paasta_tools/check_kubernetes_services_replication.py +141 -0
- paasta_tools/check_oom_events.py +244 -0
- paasta_tools/check_services_replication_tools.py +324 -0
- paasta_tools/check_spark_jobs.py +234 -0
- paasta_tools/cleanup_kubernetes_cr.py +138 -0
- paasta_tools/cleanup_kubernetes_crd.py +145 -0
- paasta_tools/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools/cleanup_tron_namespaces.py +96 -0
- paasta_tools/cli/__init__.py +13 -0
- paasta_tools/cli/authentication.py +85 -0
- paasta_tools/cli/cli.py +260 -0
- paasta_tools/cli/cmds/__init__.py +13 -0
- paasta_tools/cli/cmds/autoscale.py +143 -0
- paasta_tools/cli/cmds/check.py +334 -0
- paasta_tools/cli/cmds/cook_image.py +147 -0
- paasta_tools/cli/cmds/get_docker_image.py +76 -0
- paasta_tools/cli/cmds/get_image_version.py +172 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +93 -0
- paasta_tools/cli/cmds/info.py +155 -0
- paasta_tools/cli/cmds/itest.py +117 -0
- paasta_tools/cli/cmds/list.py +66 -0
- paasta_tools/cli/cmds/list_clusters.py +42 -0
- paasta_tools/cli/cmds/list_deploy_queue.py +171 -0
- paasta_tools/cli/cmds/list_namespaces.py +84 -0
- paasta_tools/cli/cmds/local_run.py +1396 -0
- paasta_tools/cli/cmds/logs.py +1601 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +1988 -0
- paasta_tools/cli/cmds/mesh_status.py +174 -0
- paasta_tools/cli/cmds/pause_service_autoscaler.py +107 -0
- paasta_tools/cli/cmds/push_to_registry.py +275 -0
- paasta_tools/cli/cmds/remote_run.py +252 -0
- paasta_tools/cli/cmds/rollback.py +347 -0
- paasta_tools/cli/cmds/secret.py +549 -0
- paasta_tools/cli/cmds/security_check.py +59 -0
- paasta_tools/cli/cmds/spark_run.py +1400 -0
- paasta_tools/cli/cmds/start_stop_restart.py +401 -0
- paasta_tools/cli/cmds/status.py +2302 -0
- paasta_tools/cli/cmds/validate.py +1012 -0
- paasta_tools/cli/cmds/wait_for_deployment.py +275 -0
- paasta_tools/cli/fsm/__init__.py +13 -0
- paasta_tools/cli/fsm/autosuggest.py +82 -0
- paasta_tools/cli/fsm/template/README.md +8 -0
- paasta_tools/cli/fsm/template/cookiecutter.json +7 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/kubernetes-PROD.yaml +91 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/monitoring.yaml +20 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/service.yaml +8 -0
- paasta_tools/cli/fsm/template/{{cookiecutter.service}}/smartstack.yaml +6 -0
- paasta_tools/cli/fsm_cmd.py +121 -0
- paasta_tools/cli/paasta_tabcomplete.sh +23 -0
- paasta_tools/cli/schemas/adhoc_schema.json +199 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +91 -0
- paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json +37 -0
- paasta_tools/cli/schemas/autotuned_defaults/kubernetes_schema.json +89 -0
- paasta_tools/cli/schemas/deploy_schema.json +173 -0
- paasta_tools/cli/schemas/eks_schema.json +970 -0
- paasta_tools/cli/schemas/kubernetes_schema.json +970 -0
- paasta_tools/cli/schemas/rollback_schema.json +160 -0
- paasta_tools/cli/schemas/service_schema.json +25 -0
- paasta_tools/cli/schemas/smartstack_schema.json +322 -0
- paasta_tools/cli/schemas/tron_schema.json +699 -0
- paasta_tools/cli/utils.py +1118 -0
- paasta_tools/clusterman.py +21 -0
- paasta_tools/config_utils.py +385 -0
- paasta_tools/contrib/__init__.py +0 -0
- paasta_tools/contrib/bounce_log_latency_parser.py +68 -0
- paasta_tools/contrib/check_manual_oapi_changes.sh +24 -0
- paasta_tools/contrib/check_orphans.py +306 -0
- paasta_tools/contrib/create_dynamodb_table.py +35 -0
- paasta_tools/contrib/create_paasta_playground.py +105 -0
- paasta_tools/contrib/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools/contrib/get_running_task_allocation.py +346 -0
- paasta_tools/contrib/habitat_fixer.py +86 -0
- paasta_tools/contrib/ide_helper.py +316 -0
- paasta_tools/contrib/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools/contrib/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools/contrib/kill_bad_containers.py +109 -0
- paasta_tools/contrib/mass-deploy-tag.sh +44 -0
- paasta_tools/contrib/mock_patch_checker.py +86 -0
- paasta_tools/contrib/paasta_update_soa_memcpu.py +520 -0
- paasta_tools/contrib/render_template.py +129 -0
- paasta_tools/contrib/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools/contrib/service_shard_remove.py +157 -0
- paasta_tools/contrib/service_shard_update.py +373 -0
- paasta_tools/contrib/shared_ip_check.py +77 -0
- paasta_tools/contrib/timeouts_metrics_prom.py +64 -0
- paasta_tools/delete_kubernetes_deployments.py +89 -0
- paasta_tools/deployment_utils.py +44 -0
- paasta_tools/docker_wrapper.py +234 -0
- paasta_tools/docker_wrapper_imports.py +13 -0
- paasta_tools/drain_lib.py +351 -0
- paasta_tools/dump_locally_running_services.py +71 -0
- paasta_tools/eks_tools.py +119 -0
- paasta_tools/envoy_tools.py +373 -0
- paasta_tools/firewall.py +504 -0
- paasta_tools/firewall_logging.py +154 -0
- paasta_tools/firewall_update.py +172 -0
- paasta_tools/flink_tools.py +345 -0
- paasta_tools/flinkeks_tools.py +90 -0
- paasta_tools/frameworks/__init__.py +0 -0
- paasta_tools/frameworks/adhoc_scheduler.py +71 -0
- paasta_tools/frameworks/constraints.py +87 -0
- paasta_tools/frameworks/native_scheduler.py +652 -0
- paasta_tools/frameworks/native_service_config.py +301 -0
- paasta_tools/frameworks/task_store.py +245 -0
- paasta_tools/generate_all_deployments +9 -0
- paasta_tools/generate_authenticating_services.py +94 -0
- paasta_tools/generate_deployments_for_service.py +255 -0
- paasta_tools/generate_services_file.py +114 -0
- paasta_tools/generate_services_yaml.py +30 -0
- paasta_tools/hacheck.py +76 -0
- paasta_tools/instance/__init__.py +0 -0
- paasta_tools/instance/hpa_metrics_parser.py +122 -0
- paasta_tools/instance/kubernetes.py +1362 -0
- paasta_tools/iptables.py +240 -0
- paasta_tools/kafkacluster_tools.py +143 -0
- paasta_tools/kubernetes/__init__.py +0 -0
- paasta_tools/kubernetes/application/__init__.py +0 -0
- paasta_tools/kubernetes/application/controller_wrappers.py +476 -0
- paasta_tools/kubernetes/application/tools.py +90 -0
- paasta_tools/kubernetes/bin/__init__.py +0 -0
- paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools/kubernetes/bin/paasta_secrets_sync.py +758 -0
- paasta_tools/kubernetes/remote_run.py +558 -0
- paasta_tools/kubernetes_tools.py +4679 -0
- paasta_tools/list_kubernetes_service_instances.py +128 -0
- paasta_tools/list_tron_namespaces.py +60 -0
- paasta_tools/long_running_service_tools.py +678 -0
- paasta_tools/mac_address.py +44 -0
- paasta_tools/marathon_dashboard.py +0 -0
- paasta_tools/mesos/__init__.py +0 -0
- paasta_tools/mesos/cfg.py +46 -0
- paasta_tools/mesos/cluster.py +60 -0
- paasta_tools/mesos/exceptions.py +59 -0
- paasta_tools/mesos/framework.py +77 -0
- paasta_tools/mesos/log.py +48 -0
- paasta_tools/mesos/master.py +306 -0
- paasta_tools/mesos/mesos_file.py +169 -0
- paasta_tools/mesos/parallel.py +52 -0
- paasta_tools/mesos/slave.py +115 -0
- paasta_tools/mesos/task.py +94 -0
- paasta_tools/mesos/util.py +69 -0
- paasta_tools/mesos/zookeeper.py +37 -0
- paasta_tools/mesos_maintenance.py +848 -0
- paasta_tools/mesos_tools.py +1051 -0
- paasta_tools/metrics/__init__.py +0 -0
- paasta_tools/metrics/metastatus_lib.py +1110 -0
- paasta_tools/metrics/metrics_lib.py +217 -0
- paasta_tools/monitoring/__init__.py +13 -0
- paasta_tools/monitoring/check_k8s_api_performance.py +110 -0
- paasta_tools/monitoring_tools.py +652 -0
- paasta_tools/monkrelaycluster_tools.py +146 -0
- paasta_tools/nrtsearchservice_tools.py +143 -0
- paasta_tools/nrtsearchserviceeks_tools.py +68 -0
- paasta_tools/oom_logger.py +321 -0
- paasta_tools/paasta_deploy_tron_jobs +3 -0
- paasta_tools/paasta_execute_docker_command.py +123 -0
- paasta_tools/paasta_native_serviceinit.py +21 -0
- paasta_tools/paasta_service_config_loader.py +201 -0
- paasta_tools/paastaapi/__init__.py +29 -0
- paasta_tools/paastaapi/api/__init__.py +3 -0
- paasta_tools/paastaapi/api/autoscaler_api.py +302 -0
- paasta_tools/paastaapi/api/default_api.py +569 -0
- paasta_tools/paastaapi/api/remote_run_api.py +604 -0
- paasta_tools/paastaapi/api/resources_api.py +157 -0
- paasta_tools/paastaapi/api/service_api.py +1736 -0
- paasta_tools/paastaapi/api_client.py +818 -0
- paasta_tools/paastaapi/apis/__init__.py +22 -0
- paasta_tools/paastaapi/configuration.py +455 -0
- paasta_tools/paastaapi/exceptions.py +137 -0
- paasta_tools/paastaapi/model/__init__.py +5 -0
- paasta_tools/paastaapi/model/adhoc_launch_history.py +176 -0
- paasta_tools/paastaapi/model/autoscaler_count_msg.py +176 -0
- paasta_tools/paastaapi/model/deploy_queue.py +178 -0
- paasta_tools/paastaapi/model/deploy_queue_service_instance.py +194 -0
- paasta_tools/paastaapi/model/envoy_backend.py +185 -0
- paasta_tools/paastaapi/model/envoy_location.py +184 -0
- paasta_tools/paastaapi/model/envoy_status.py +181 -0
- paasta_tools/paastaapi/model/flink_cluster_overview.py +188 -0
- paasta_tools/paastaapi/model/flink_config.py +173 -0
- paasta_tools/paastaapi/model/flink_job.py +186 -0
- paasta_tools/paastaapi/model/flink_job_details.py +192 -0
- paasta_tools/paastaapi/model/flink_jobs.py +175 -0
- paasta_tools/paastaapi/model/float_and_error.py +173 -0
- paasta_tools/paastaapi/model/hpa_metric.py +176 -0
- paasta_tools/paastaapi/model/inline_object.py +170 -0
- paasta_tools/paastaapi/model/inline_response200.py +170 -0
- paasta_tools/paastaapi/model/inline_response2001.py +170 -0
- paasta_tools/paastaapi/model/instance_bounce_status.py +200 -0
- paasta_tools/paastaapi/model/instance_mesh_status.py +186 -0
- paasta_tools/paastaapi/model/instance_status.py +220 -0
- paasta_tools/paastaapi/model/instance_status_adhoc.py +187 -0
- paasta_tools/paastaapi/model/instance_status_cassandracluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_flink.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kafkacluster.py +173 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes.py +263 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_autoscaling_status.py +187 -0
- paasta_tools/paastaapi/model/instance_status_kubernetes_v2.py +197 -0
- paasta_tools/paastaapi/model/instance_status_tron.py +204 -0
- paasta_tools/paastaapi/model/instance_tasks.py +182 -0
- paasta_tools/paastaapi/model/integer_and_error.py +173 -0
- paasta_tools/paastaapi/model/kubernetes_container.py +178 -0
- paasta_tools/paastaapi/model/kubernetes_container_v2.py +219 -0
- paasta_tools/paastaapi/model/kubernetes_healthcheck.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod.py +201 -0
- paasta_tools/paastaapi/model/kubernetes_pod_event.py +176 -0
- paasta_tools/paastaapi/model/kubernetes_pod_v2.py +213 -0
- paasta_tools/paastaapi/model/kubernetes_replica_set.py +185 -0
- paasta_tools/paastaapi/model/kubernetes_version.py +202 -0
- paasta_tools/paastaapi/model/remote_run_outcome.py +189 -0
- paasta_tools/paastaapi/model/remote_run_start.py +185 -0
- paasta_tools/paastaapi/model/remote_run_stop.py +176 -0
- paasta_tools/paastaapi/model/remote_run_token.py +173 -0
- paasta_tools/paastaapi/model/resource.py +187 -0
- paasta_tools/paastaapi/model/resource_item.py +187 -0
- paasta_tools/paastaapi/model/resource_value.py +176 -0
- paasta_tools/paastaapi/model/smartstack_backend.py +191 -0
- paasta_tools/paastaapi/model/smartstack_location.py +181 -0
- paasta_tools/paastaapi/model/smartstack_status.py +181 -0
- paasta_tools/paastaapi/model/task_tail_lines.py +176 -0
- paasta_tools/paastaapi/model_utils.py +1879 -0
- paasta_tools/paastaapi/models/__init__.py +62 -0
- paasta_tools/paastaapi/rest.py +287 -0
- paasta_tools/prune_completed_pods.py +220 -0
- paasta_tools/puppet_service_tools.py +59 -0
- paasta_tools/py.typed +1 -0
- paasta_tools/remote_git.py +127 -0
- paasta_tools/run-paasta-api-in-dev-mode.py +57 -0
- paasta_tools/run-paasta-api-playground.py +51 -0
- paasta_tools/secret_providers/__init__.py +66 -0
- paasta_tools/secret_providers/vault.py +214 -0
- paasta_tools/secret_tools.py +277 -0
- paasta_tools/setup_istio_mesh.py +353 -0
- paasta_tools/setup_kubernetes_cr.py +412 -0
- paasta_tools/setup_kubernetes_crd.py +138 -0
- paasta_tools/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools/setup_kubernetes_job.py +353 -0
- paasta_tools/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools/setup_tron_namespace.py +248 -0
- paasta_tools/slack.py +75 -0
- paasta_tools/smartstack_tools.py +676 -0
- paasta_tools/spark_tools.py +283 -0
- paasta_tools/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools/tron/__init__.py +0 -0
- paasta_tools/tron/client.py +158 -0
- paasta_tools/tron/tron_command_context.py +194 -0
- paasta_tools/tron/tron_timeutils.py +101 -0
- paasta_tools/tron_tools.py +1448 -0
- paasta_tools/utils.py +4307 -0
- paasta_tools/yaml_tools.py +44 -0
- paasta_tools-1.21.3.data/scripts/apply_external_resources.py +79 -0
- paasta_tools-1.21.3.data/scripts/bounce_log_latency_parser.py +68 -0
- paasta_tools-1.21.3.data/scripts/check_autoscaler_max_instances.py +212 -0
- paasta_tools-1.21.3.data/scripts/check_cassandracluster_services_replication.py +35 -0
- paasta_tools-1.21.3.data/scripts/check_flink_services_health.py +203 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_api.py +57 -0
- paasta_tools-1.21.3.data/scripts/check_kubernetes_services_replication.py +141 -0
- paasta_tools-1.21.3.data/scripts/check_manual_oapi_changes.sh +24 -0
- paasta_tools-1.21.3.data/scripts/check_oom_events.py +244 -0
- paasta_tools-1.21.3.data/scripts/check_orphans.py +306 -0
- paasta_tools-1.21.3.data/scripts/check_spark_jobs.py +234 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_cr.py +138 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_crd.py +145 -0
- paasta_tools-1.21.3.data/scripts/cleanup_kubernetes_jobs.py +344 -0
- paasta_tools-1.21.3.data/scripts/create_dynamodb_table.py +35 -0
- paasta_tools-1.21.3.data/scripts/create_paasta_playground.py +105 -0
- paasta_tools-1.21.3.data/scripts/delete_kubernetes_deployments.py +89 -0
- paasta_tools-1.21.3.data/scripts/emit_allocated_cpu_metrics.py +50 -0
- paasta_tools-1.21.3.data/scripts/generate_all_deployments +9 -0
- paasta_tools-1.21.3.data/scripts/generate_authenticating_services.py +94 -0
- paasta_tools-1.21.3.data/scripts/generate_deployments_for_service.py +255 -0
- paasta_tools-1.21.3.data/scripts/generate_services_file.py +114 -0
- paasta_tools-1.21.3.data/scripts/generate_services_yaml.py +30 -0
- paasta_tools-1.21.3.data/scripts/get_running_task_allocation.py +346 -0
- paasta_tools-1.21.3.data/scripts/habitat_fixer.py +86 -0
- paasta_tools-1.21.3.data/scripts/ide_helper.py +316 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_proxy.py +139 -0
- paasta_tools-1.21.3.data/scripts/is_pod_healthy_in_smartstack.py +50 -0
- paasta_tools-1.21.3.data/scripts/kill_bad_containers.py +109 -0
- paasta_tools-1.21.3.data/scripts/kubernetes_remove_evicted_pods.py +164 -0
- paasta_tools-1.21.3.data/scripts/mass-deploy-tag.sh +44 -0
- paasta_tools-1.21.3.data/scripts/mock_patch_checker.py +86 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_remote_run_resources.py +135 -0
- paasta_tools-1.21.3.data/scripts/paasta_cleanup_stale_nodes.py +181 -0
- paasta_tools-1.21.3.data/scripts/paasta_deploy_tron_jobs +3 -0
- paasta_tools-1.21.3.data/scripts/paasta_execute_docker_command.py +123 -0
- paasta_tools-1.21.3.data/scripts/paasta_secrets_sync.py +758 -0
- paasta_tools-1.21.3.data/scripts/paasta_tabcomplete.sh +23 -0
- paasta_tools-1.21.3.data/scripts/paasta_update_soa_memcpu.py +520 -0
- paasta_tools-1.21.3.data/scripts/render_template.py +129 -0
- paasta_tools-1.21.3.data/scripts/rightsizer_soaconfigs_update.py +348 -0
- paasta_tools-1.21.3.data/scripts/service_shard_remove.py +157 -0
- paasta_tools-1.21.3.data/scripts/service_shard_update.py +373 -0
- paasta_tools-1.21.3.data/scripts/setup_istio_mesh.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_cr.py +412 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_crd.py +138 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_internal_crd.py +154 -0
- paasta_tools-1.21.3.data/scripts/setup_kubernetes_job.py +353 -0
- paasta_tools-1.21.3.data/scripts/setup_prometheus_adapter_config.py +1028 -0
- paasta_tools-1.21.3.data/scripts/shared_ip_check.py +77 -0
- paasta_tools-1.21.3.data/scripts/synapse_srv_namespaces_fact.py +42 -0
- paasta_tools-1.21.3.data/scripts/timeouts_metrics_prom.py +64 -0
- paasta_tools-1.21.3.dist-info/LICENSE +201 -0
- paasta_tools-1.21.3.dist-info/METADATA +74 -0
- paasta_tools-1.21.3.dist-info/RECORD +348 -0
- paasta_tools-1.21.3.dist-info/WHEEL +5 -0
- paasta_tools-1.21.3.dist-info/entry_points.txt +20 -0
- paasta_tools-1.21.3.dist-info/top_level.txt +2 -0
paasta_tools/cli/cmds/spark_run.py
@@ -0,0 +1,1400 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import shlex
|
|
7
|
+
import socket
|
|
8
|
+
import sys
|
|
9
|
+
from configparser import ConfigParser
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import cast
|
|
12
|
+
from typing import Dict
|
|
13
|
+
from typing import List
|
|
14
|
+
from typing import Mapping
|
|
15
|
+
from typing import Optional
|
|
16
|
+
from typing import Set
|
|
17
|
+
from typing import Tuple
|
|
18
|
+
from typing import Union
|
|
19
|
+
|
|
20
|
+
from service_configuration_lib import read_service_configuration
|
|
21
|
+
from service_configuration_lib import read_yaml_file
|
|
22
|
+
from service_configuration_lib import spark_config
|
|
23
|
+
from service_configuration_lib.spark_config import get_aws_credentials
|
|
24
|
+
from service_configuration_lib.spark_config import get_grafana_url
|
|
25
|
+
from service_configuration_lib.spark_config import get_resources_requested
|
|
26
|
+
from service_configuration_lib.spark_config import get_spark_hourly_cost
|
|
27
|
+
from service_configuration_lib.spark_config import UnsupportedClusterManagerException
|
|
28
|
+
|
|
29
|
+
from paasta_tools.cli.authentication import get_service_auth_token
|
|
30
|
+
from paasta_tools.cli.cmds.check import makefile_responds_to
|
|
31
|
+
from paasta_tools.cli.cmds.cook_image import paasta_cook_image
|
|
32
|
+
from paasta_tools.cli.utils import get_instance_config
|
|
33
|
+
from paasta_tools.cli.utils import lazy_choices_completer
|
|
34
|
+
from paasta_tools.cli.utils import list_instances
|
|
35
|
+
from paasta_tools.clusterman import get_clusterman_metrics
|
|
36
|
+
from paasta_tools.kubernetes_tools import get_service_account_name
|
|
37
|
+
from paasta_tools.spark_tools import auto_add_timeout_for_spark_job
|
|
38
|
+
from paasta_tools.spark_tools import create_spark_config_str
|
|
39
|
+
from paasta_tools.spark_tools import DEFAULT_SPARK_RUNTIME_TIMEOUT
|
|
40
|
+
from paasta_tools.spark_tools import DEFAULT_SPARK_SERVICE
|
|
41
|
+
from paasta_tools.spark_tools import get_volumes_from_spark_k8s_configs
|
|
42
|
+
from paasta_tools.spark_tools import get_webui_url
|
|
43
|
+
from paasta_tools.spark_tools import inject_spark_conf_str
|
|
44
|
+
from paasta_tools.tron_tools import load_tron_instance_configs
|
|
45
|
+
from paasta_tools.utils import _run
|
|
46
|
+
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
47
|
+
from paasta_tools.utils import filter_templates_from_config
|
|
48
|
+
from paasta_tools.utils import get_k8s_url_for_cluster
|
|
49
|
+
from paasta_tools.utils import get_possible_launched_by_user_variable_from_env
|
|
50
|
+
from paasta_tools.utils import get_username
|
|
51
|
+
from paasta_tools.utils import InstanceConfig
|
|
52
|
+
from paasta_tools.utils import is_using_unprivileged_containers
|
|
53
|
+
from paasta_tools.utils import list_services
|
|
54
|
+
from paasta_tools.utils import load_system_paasta_config
|
|
55
|
+
from paasta_tools.utils import NoConfigurationForServiceError
|
|
56
|
+
from paasta_tools.utils import NoDeploymentsAvailable
|
|
57
|
+
from paasta_tools.utils import NoDockerImageError
|
|
58
|
+
from paasta_tools.utils import PaastaColors
|
|
59
|
+
from paasta_tools.utils import PaastaNotConfiguredError
|
|
60
|
+
from paasta_tools.utils import PoolsNotConfiguredError
|
|
61
|
+
from paasta_tools.utils import SystemPaastaConfig
|
|
62
|
+
from paasta_tools.utils import validate_pool
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
DEFAULT_AWS_REGION = "us-west-2"
|
|
66
|
+
DEFAULT_SPARK_WORK_DIR = "/spark_driver"
|
|
67
|
+
DEFAULT_SPARK_DOCKER_IMAGE_PREFIX = "paasta-spark-run"
|
|
68
|
+
DEFAULT_SPARK_DOCKER_REGISTRY = "docker-dev.yelpcorp.com"
|
|
69
|
+
SENSITIVE_ENV = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"]
|
|
70
|
+
clusterman_metrics, CLUSTERMAN_YAML_FILE_PATH = get_clusterman_metrics()
|
|
71
|
+
CLUSTER_MANAGER_K8S = "kubernetes"
|
|
72
|
+
CLUSTER_MANAGER_LOCAL = "local"
|
|
73
|
+
CLUSTER_MANAGERS = {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL}
|
|
74
|
+
DEFAULT_DOCKER_SHM_SIZE = "64m"
|
|
75
|
+
# Reference: https://spark.apache.org/docs/latest/configuration.html#application-properties
|
|
76
|
+
DEFAULT_DRIVER_CORES_BY_SPARK = 1
|
|
77
|
+
DEFAULT_DRIVER_MEMORY_BY_SPARK = "1g"
|
|
78
|
+
# Extra room for memory overhead and for any other running inside container
|
|
79
|
+
DOCKER_RESOURCE_ADJUSTMENT_FACTOR = 2
|
|
80
|
+
|
|
81
|
+
DEPRECATED_OPTS = {
|
|
82
|
+
"j": "spark.jars",
|
|
83
|
+
"jars": "spark.jars",
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
SPARK_COMMANDS = {"pyspark", "spark-submit"}
|
|
87
|
+
|
|
88
|
+
# config looks as follows:
|
|
89
|
+
# [default]
|
|
90
|
+
# aws_access_key_id = ...
|
|
91
|
+
# aws_secret_access_key = ...
|
|
92
|
+
SPARK_DRIVER_IAM_USER = (
|
|
93
|
+
"/nail/etc/spark_driver_k8s_role_assumer/spark_driver_k8s_role_assumer.ini"
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
log = logging.getLogger(__name__)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class DeprecatedAction(argparse.Action):
|
|
100
|
+
def __init__(self, option_strings, dest, nargs="?", **kwargs):
|
|
101
|
+
super().__init__(option_strings, dest, nargs=nargs, **kwargs)
|
|
102
|
+
|
|
103
|
+
def __call__(self, parser, namespace, values, option_string=None):
|
|
104
|
+
print(
|
|
105
|
+
PaastaColors.red(
|
|
106
|
+
f"Use of {option_string} is deprecated. "
|
|
107
|
+
+ (
|
|
108
|
+
f"Please use {DEPRECATED_OPTS.get(option_string.strip('-'), '')}=value in --spark-args."
|
|
109
|
+
if option_string.strip("-") in DEPRECATED_OPTS
|
|
110
|
+
else ""
|
|
111
|
+
)
|
|
112
|
+
)
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def add_subparser(subparsers):
|
|
117
|
+
list_parser = subparsers.add_parser(
|
|
118
|
+
"spark-run",
|
|
119
|
+
help="Run Spark on the PaaSTA cluster",
|
|
120
|
+
description=(
|
|
121
|
+
"'paasta spark-run' launches a Spark cluster on PaaSTA. "
|
|
122
|
+
"It analyzes soa-configs and command line arguments to invoke "
|
|
123
|
+
"a 'docker run'. By default, it will pull the Spark service "
|
|
124
|
+
"image from the registry unless the --build option is used.\n\n"
|
|
125
|
+
),
|
|
126
|
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
127
|
+
)
|
|
128
|
+
# Deprecated args kept to avoid failures
|
|
129
|
+
# TODO: Remove these deprecated args later
|
|
130
|
+
list_parser.add_argument(
|
|
131
|
+
"--jars",
|
|
132
|
+
help=argparse.SUPPRESS,
|
|
133
|
+
action=DeprecatedAction,
|
|
134
|
+
)
|
|
135
|
+
list_parser.add_argument(
|
|
136
|
+
"--executor-memory",
|
|
137
|
+
help=argparse.SUPPRESS,
|
|
138
|
+
action=DeprecatedAction,
|
|
139
|
+
)
|
|
140
|
+
list_parser.add_argument(
|
|
141
|
+
"--executor-cores",
|
|
142
|
+
help=argparse.SUPPRESS,
|
|
143
|
+
action=DeprecatedAction,
|
|
144
|
+
)
|
|
145
|
+
list_parser.add_argument(
|
|
146
|
+
"--max-cores",
|
|
147
|
+
help=argparse.SUPPRESS,
|
|
148
|
+
action=DeprecatedAction,
|
|
149
|
+
)
|
|
150
|
+
list_parser.add_argument(
|
|
151
|
+
"-e",
|
|
152
|
+
"--enable-compact-bin-packing",
|
|
153
|
+
help=argparse.SUPPRESS,
|
|
154
|
+
action=DeprecatedAction,
|
|
155
|
+
)
|
|
156
|
+
list_parser.add_argument(
|
|
157
|
+
"--enable-dra",
|
|
158
|
+
help=argparse.SUPPRESS,
|
|
159
|
+
action=DeprecatedAction,
|
|
160
|
+
)
|
|
161
|
+
list_parser.add_argument(
|
|
162
|
+
"--force-use-eks",
|
|
163
|
+
help=argparse.SUPPRESS,
|
|
164
|
+
action=DeprecatedAction,
|
|
165
|
+
)
|
|
166
|
+
list_parser.add_argument(
|
|
167
|
+
"--get-eks-token-via-iam-user",
|
|
168
|
+
help="Use IAM user to get EKS token for long running spark-run jobs",
|
|
169
|
+
action="store_true",
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
group = list_parser.add_mutually_exclusive_group()
|
|
173
|
+
group.add_argument(
|
|
174
|
+
"-b",
|
|
175
|
+
"--build",
|
|
176
|
+
help="Build the docker image from scratch using the local Makefile's cook-image target.",
|
|
177
|
+
action="store_true",
|
|
178
|
+
default=False,
|
|
179
|
+
)
|
|
180
|
+
group.add_argument(
|
|
181
|
+
"-I",
|
|
182
|
+
"--image",
|
|
183
|
+
help="Use the provided image to start the Spark driver and executors.",
|
|
184
|
+
)
|
|
185
|
+
list_parser.add_argument(
|
|
186
|
+
"--docker-memory-limit",
|
|
187
|
+
help=(
|
|
188
|
+
"Set docker memory limit. Should be greater than driver memory. Defaults to 2x spark.driver.memory. Example: 2g, 500m, Max: 64g"
|
|
189
|
+
" Note: If memory limit provided is greater than associated with the batch instance, it will default to max memory of the box."
|
|
190
|
+
),
|
|
191
|
+
default=None,
|
|
192
|
+
)
|
|
193
|
+
list_parser.add_argument(
|
|
194
|
+
"--docker-cpu-limit",
|
|
195
|
+
help=(
|
|
196
|
+
"Set docker cpus limit. Should be greater than driver cores. Defaults to 1x spark.driver.cores."
|
|
197
|
+
" Note: The job will fail if the limit provided is greater than number of cores present on batch box (8 for production batch boxes)."
|
|
198
|
+
),
|
|
199
|
+
default=None,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
list_parser.add_argument(
|
|
203
|
+
"--docker-shm-size",
|
|
204
|
+
help=(
|
|
205
|
+
"Set docker shared memory size limit for the driver's container. This is the same as setting docker run --shm-size and the shared"
|
|
206
|
+
" memory is mounted to /dev/shm in the container. Anything written to the shared memory mount point counts towards the docker memory"
|
|
207
|
+
" limit for the driver's container. Therefore, this should be less than --docker-memory-limit."
|
|
208
|
+
f" Defaults to {DEFAULT_DOCKER_SHM_SIZE}. Example: 8g, 256m"
|
|
209
|
+
" Note: this option is mainly useful when training TensorFlow models in the driver, with multiple GPUs using NCCL. The shared memory"
|
|
210
|
+
f" space is used to sync gradient updates between GPUs during training. The default value of {DEFAULT_DOCKER_SHM_SIZE} is typically not large enough for"
|
|
211
|
+
" this inter-gpu communication to run efficiently. We recommend a starting value of 8g to ensure that the entire set of model parameters"
|
|
212
|
+
" can fit in the shared memory. This can be less if you are training a smaller model (<1g parameters) or more if you are using a larger model (>2.5g parameters)"
|
|
213
|
+
" If you are observing low, average GPU utilization during epoch training (<65-70 percent) you can also try increasing this value; you may be"
|
|
214
|
+
" resource constrained when GPUs sync training weights between mini-batches (there are other potential bottlenecks that could cause this as well)."
|
|
215
|
+
" A tool such as nvidia-smi can be use to check GPU utilization."
|
|
216
|
+
" This option also adds the --ulimit memlock=-1 to the docker run command since this is recommended for TensorFlow applications that use NCCL."
|
|
217
|
+
" Please refer to docker run documentation for more details on --shm-size and --ulimit memlock=-1."
|
|
218
|
+
),
|
|
219
|
+
default=None,
|
|
220
|
+
)
|
|
221
|
+
list_parser.add_argument(
|
|
222
|
+
"--force-spark-resource-configs",
|
|
223
|
+
help=(
|
|
224
|
+
"Skip the resource/instances recalculation. "
|
|
225
|
+
"This is strongly not recommended."
|
|
226
|
+
),
|
|
227
|
+
action="store_true",
|
|
228
|
+
default=False,
|
|
229
|
+
)
|
|
230
|
+
list_parser.add_argument(
|
|
231
|
+
"--docker-registry",
|
|
232
|
+
help="Docker registry to push the Spark image built.",
|
|
233
|
+
default=None,
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
list_parser.add_argument(
|
|
237
|
+
"-s",
|
|
238
|
+
"--service",
|
|
239
|
+
help="The name of the service from which the Spark image is built.",
|
|
240
|
+
default=DEFAULT_SPARK_SERVICE,
|
|
241
|
+
).completer = lazy_choices_completer(list_services)
|
|
242
|
+
|
|
243
|
+
list_parser.add_argument(
|
|
244
|
+
"-i",
|
|
245
|
+
"--instance",
|
|
246
|
+
help="Start a docker run for a particular instance of the service.",
|
|
247
|
+
default="adhoc",
|
|
248
|
+
).completer = lazy_choices_completer(list_instances)
|
|
249
|
+
|
|
250
|
+
try:
|
|
251
|
+
system_paasta_config = load_system_paasta_config()
|
|
252
|
+
valid_clusters = system_paasta_config.get_clusters()
|
|
253
|
+
default_spark_cluster = system_paasta_config.get_spark_run_config().get(
|
|
254
|
+
"default_cluster"
|
|
255
|
+
)
|
|
256
|
+
default_spark_pool = system_paasta_config.get_spark_run_config().get(
|
|
257
|
+
"default_pool"
|
|
258
|
+
)
|
|
259
|
+
except PaastaNotConfiguredError:
|
|
260
|
+
default_spark_cluster = "pnw-devc-spark"
|
|
261
|
+
default_spark_pool = "batch"
|
|
262
|
+
valid_clusters = ["pnw-devc-spark", "pnw-prod-spark"]
|
|
263
|
+
|
|
264
|
+
list_parser.add_argument(
|
|
265
|
+
"-c",
|
|
266
|
+
"--cluster",
|
|
267
|
+
help="The name of the cluster you wish to run Spark on.",
|
|
268
|
+
choices=valid_clusters,
|
|
269
|
+
default=default_spark_cluster,
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
list_parser.add_argument(
|
|
273
|
+
"-p",
|
|
274
|
+
"--pool",
|
|
275
|
+
help="Name of the resource pool to run the Spark job.",
|
|
276
|
+
default=default_spark_pool,
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
list_parser.add_argument(
|
|
280
|
+
"-w",
|
|
281
|
+
"--work-dir",
|
|
282
|
+
default="{}:{}".format(os.getcwd(), DEFAULT_SPARK_WORK_DIR),
|
|
283
|
+
help="The read-write volume to mount in format local_abs_dir:container_abs_dir",
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
list_parser.add_argument(
|
|
287
|
+
"-y",
|
|
288
|
+
"--yelpsoa-config-root",
|
|
289
|
+
dest="yelpsoa_config_root",
|
|
290
|
+
help="A directory from which yelpsoa-configs should be read from.",
|
|
291
|
+
default=DEFAULT_SOA_DIR,
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
list_parser.add_argument(
|
|
295
|
+
"-C",
|
|
296
|
+
"--cmd",
|
|
297
|
+
help="Run the spark-shell, pyspark, spark-submit, jupyter-lab, or history-server command.",
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
list_parser.add_argument(
|
|
301
|
+
"--timeout-job-runtime",
|
|
302
|
+
type=str,
|
|
303
|
+
help="Timeout value which will be added before spark-submit. Job will exit if it doesn't finish in given "
|
|
304
|
+
"runtime. Recommended value: 2 * expected runtime. Example: 1h, 30m",
|
|
305
|
+
default=DEFAULT_SPARK_RUNTIME_TIMEOUT,
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
list_parser.add_argument(
|
|
309
|
+
"-d",
|
|
310
|
+
"--dry-run",
|
|
311
|
+
help="Shows the arguments supplied to docker as json.",
|
|
312
|
+
action="store_true",
|
|
313
|
+
default=False,
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
list_parser.add_argument(
|
|
317
|
+
"--spark-args",
|
|
318
|
+
help="Spark configurations documented in https://spark.apache.org/docs/latest/configuration.html, "
|
|
319
|
+
'separated by space. For example, --spark-args "spark.executor.cores=1 spark.executor.memory=7g '
|
|
320
|
+
'spark.executor.instances=2".',
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
list_parser.add_argument(
|
|
324
|
+
"--nvidia",
|
|
325
|
+
help="Use nvidia docker runtime for Spark driver process (requires GPU)",
|
|
326
|
+
action="store_true",
|
|
327
|
+
default=False,
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
list_parser.add_argument(
|
|
331
|
+
"--mrjob",
|
|
332
|
+
help="Pass Spark arguments to invoked command in the format expected by mrjobs",
|
|
333
|
+
action="store_true",
|
|
334
|
+
default=False,
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
list_parser.add_argument(
|
|
338
|
+
"--cluster-manager",
|
|
339
|
+
help="Specify which cluster manager to use. Support for certain cluster managers may be experimental",
|
|
340
|
+
dest="cluster_manager",
|
|
341
|
+
choices=CLUSTER_MANAGERS,
|
|
342
|
+
default=CLUSTER_MANAGER_K8S,
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
list_parser.add_argument(
|
|
346
|
+
"--tronfig",
|
|
347
|
+
help="Load the Tron config yaml. Use with --job-id.",
|
|
348
|
+
type=str,
|
|
349
|
+
default=None,
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
list_parser.add_argument(
|
|
353
|
+
"--job-id",
|
|
354
|
+
help="Tron job id <job_name>.<action_name> in the Tronfig to run. Use wuth --tronfig.",
|
|
355
|
+
type=str,
|
|
356
|
+
default=None,
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
list_parser.add_argument(
|
|
360
|
+
"--use-service-auth-token",
|
|
361
|
+
help=(
|
|
362
|
+
"Acquire service authentication token for the underlying instance,"
|
|
363
|
+
" and set it in the container environment"
|
|
364
|
+
),
|
|
365
|
+
action="store_true",
|
|
366
|
+
dest="use_service_auth_token",
|
|
367
|
+
required=False,
|
|
368
|
+
default=False,
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
list_parser.add_argument(
|
|
372
|
+
"--uses-bulkdata",
|
|
373
|
+
help="Mount /nail/bulkdata in the container",
|
|
374
|
+
action="store_true",
|
|
375
|
+
default=False,
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
aws_group = list_parser.add_argument_group(
|
|
379
|
+
title="AWS credentials options",
|
|
380
|
+
description="If --aws-credentials-yaml is specified, it overrides all "
|
|
381
|
+
"other options. Otherwise, if -s/--service is specified, spark-run "
|
|
382
|
+
"looks for service credentials in /etc/boto_cfg/[service].yaml. If "
|
|
383
|
+
"it does not find the service credentials or no service is "
|
|
384
|
+
"specified, spark-run falls back to the boto default behavior "
|
|
385
|
+
"(checking ~/.aws/credentials, ~/.boto, etc).",
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
aws_group.add_argument(
|
|
389
|
+
"--aws-credentials-yaml",
|
|
390
|
+
help="Load aws keys from the provided yaml file. The yaml file must "
|
|
391
|
+
"have keys for aws_access_key_id and aws_secret_access_key.",
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
aws_group.add_argument(
|
|
395
|
+
"--aws-profile",
|
|
396
|
+
help="Name of the AWS profile to load credentials from. Only used when "
|
|
397
|
+
"--aws-credentials-yaml is not specified and --service is either "
|
|
398
|
+
"not specified or the service does not have credentials in "
|
|
399
|
+
"/etc/boto_cfg",
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
aws_group.add_argument(
|
|
403
|
+
"--aws-region",
|
|
404
|
+
help=f"Specify an aws region. If the region is not specified, we will"
|
|
405
|
+
f"default to using {DEFAULT_AWS_REGION}.",
|
|
406
|
+
default=DEFAULT_AWS_REGION,
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
aws_group.add_argument(
|
|
410
|
+
"--assume-aws-role",
|
|
411
|
+
help=(
|
|
412
|
+
"Takes an AWS IAM role ARN and attempts to create a session using "
|
|
413
|
+
"spark_role_assumer"
|
|
414
|
+
),
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
aws_group.add_argument(
|
|
418
|
+
"--aws-role-duration",
|
|
419
|
+
help=(
|
|
420
|
+
"Duration in seconds for the role if --assume-aws-role provided. "
|
|
421
|
+
"The maximum is 43200, but by default, roles may only allow 3600."
|
|
422
|
+
),
|
|
423
|
+
type=int,
|
|
424
|
+
default=43200,
|
|
425
|
+
)
|
|
426
|
+
|
|
427
|
+
aws_group.add_argument(
|
|
428
|
+
"--use-web-identity",
|
|
429
|
+
help=(
|
|
430
|
+
"If the current environment contains AWS_ROLE_ARN and "
|
|
431
|
+
"AWS_WEB_IDENTITY_TOKEN_FILE, creates a session to use. These "
|
|
432
|
+
"ENV vars must be present, and will be in the context of a pod-"
|
|
433
|
+
"identity enabled pod."
|
|
434
|
+
),
|
|
435
|
+
action="store_true",
|
|
436
|
+
default=False,
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
aws_group.add_argument(
|
|
440
|
+
"--force-pod-identity",
|
|
441
|
+
help=(
|
|
442
|
+
"Normally the spark executor will use the pod identity defined "
|
|
443
|
+
"for the relevant instance in yelpsoa-configs. If the instance "
|
|
444
|
+
"isn't setup there yet, you can override the IAM role arn here."
|
|
445
|
+
" However, it must already be set for a different instance of "
|
|
446
|
+
"the service. Must be used with --executor-pod-identity."
|
|
447
|
+
),
|
|
448
|
+
default=None,
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
aws_group.add_argument(
|
|
452
|
+
"--executor-pod-identity",
|
|
453
|
+
help=(
|
|
454
|
+
"Launch the executor pod with pod-identity derived from "
|
|
455
|
+
"the iam_role settings attached to the instance settings in "
|
|
456
|
+
"SOA configs. See also --force-pod-identity."
|
|
457
|
+
),
|
|
458
|
+
action="store_true",
|
|
459
|
+
default=False,
|
|
460
|
+
)
|
|
461
|
+
|
|
462
|
+
jupyter_group = list_parser.add_argument_group(
|
|
463
|
+
title="Jupyter kernel culling options",
|
|
464
|
+
description="Idle kernels will be culled by default. Idle "
|
|
465
|
+
"kernels with connections can be overridden not to be culled.",
|
|
466
|
+
)
|
|
467
|
+
|
|
468
|
+
jupyter_group.add_argument(
|
|
469
|
+
"--cull-idle-timeout",
|
|
470
|
+
type=int,
|
|
471
|
+
default=7200,
|
|
472
|
+
help="Timeout (in seconds) after which a kernel is considered idle and "
|
|
473
|
+
"ready to be culled.",
|
|
474
|
+
)
|
|
475
|
+
|
|
476
|
+
jupyter_group.add_argument(
|
|
477
|
+
"--not-cull-connected",
|
|
478
|
+
action="store_true",
|
|
479
|
+
default=False,
|
|
480
|
+
help="By default, connected idle kernels are culled after timeout. "
|
|
481
|
+
"They can be skipped if not-cull-connected is specified.",
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
list_parser.set_defaults(command=paasta_spark_run)
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def sanitize_container_name(container_name):
|
|
488
|
+
# container_name only allows [a-zA-Z0-9][a-zA-Z0-9_.-]
|
|
489
|
+
return re.sub("[^a-zA-Z0-9_.-]", "_", re.sub("^[^a-zA-Z0-9]+", "", container_name))
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def get_docker_run_cmd(
|
|
493
|
+
container_name,
|
|
494
|
+
volumes,
|
|
495
|
+
env,
|
|
496
|
+
docker_img,
|
|
497
|
+
docker_cmd,
|
|
498
|
+
nvidia,
|
|
499
|
+
docker_memory_limit,
|
|
500
|
+
docker_shm_size,
|
|
501
|
+
docker_cpu_limit,
|
|
502
|
+
):
|
|
503
|
+
print(
|
|
504
|
+
f"Setting docker memory, shared memory, and cpu limits as {docker_memory_limit}, {docker_shm_size}, and {docker_cpu_limit} core(s) respectively."
|
|
505
|
+
)
|
|
506
|
+
cmd = ["paasta_docker_wrapper", "run"]
|
|
507
|
+
cmd.append(f"--memory={docker_memory_limit}")
|
|
508
|
+
if docker_shm_size is not None:
|
|
509
|
+
cmd.append(f"--shm-size={docker_shm_size}")
|
|
510
|
+
cmd.append("--ulimit")
|
|
511
|
+
cmd.append("memlock=-1")
|
|
512
|
+
cmd.append(f"--cpus={docker_cpu_limit}")
|
|
513
|
+
cmd.append("--rm")
|
|
514
|
+
cmd.append("--net=host")
|
|
515
|
+
|
|
516
|
+
non_interactive_cmd = ["spark-submit", "history-server"]
|
|
517
|
+
if not any(c in docker_cmd for c in non_interactive_cmd):
|
|
518
|
+
cmd.append("--interactive=true")
|
|
519
|
+
if sys.stdout.isatty():
|
|
520
|
+
cmd.append("--tty=true")
|
|
521
|
+
|
|
522
|
+
container_user = (
|
|
523
|
+
# root inside container == current user outside
|
|
524
|
+
(0, 0)
|
|
525
|
+
if is_using_unprivileged_containers()
|
|
526
|
+
else (os.geteuid(), os.getegid())
|
|
527
|
+
)
|
|
528
|
+
cmd.append("--user=%d:%d" % container_user)
|
|
529
|
+
cmd.append("--name=%s" % sanitize_container_name(container_name))
|
|
530
|
+
for k, v in env.items():
|
|
531
|
+
cmd.append("--env")
|
|
532
|
+
if k in SENSITIVE_ENV:
|
|
533
|
+
cmd.append(k)
|
|
534
|
+
else:
|
|
535
|
+
cmd.append(f"{k}={v}")
|
|
536
|
+
if is_using_unprivileged_containers():
|
|
537
|
+
cmd.append("--env")
|
|
538
|
+
cmd.append(f"HOME=/nail/home/{get_username()}")
|
|
539
|
+
if nvidia:
|
|
540
|
+
cmd.append("--env")
|
|
541
|
+
cmd.append("NVIDIA_VISIBLE_DEVICES=all")
|
|
542
|
+
cmd.append("--runtime=nvidia")
|
|
543
|
+
for volume in volumes:
|
|
544
|
+
cmd.append("--volume=%s" % volume)
|
|
545
|
+
cmd.append("%s" % docker_img)
|
|
546
|
+
cmd.extend(("sh", "-c", docker_cmd))
|
|
547
|
+
|
|
548
|
+
return cmd
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def get_docker_image(
|
|
552
|
+
args: argparse.Namespace, instance_config: InstanceConfig
|
|
553
|
+
) -> Optional[str]:
|
|
554
|
+
"""
|
|
555
|
+
Since the Docker image digest used to launch the Spark cluster is obtained by inspecting local
|
|
556
|
+
Docker images, we need to ensure that the Docker image exists locally or is pulled in all scenarios.
|
|
557
|
+
"""
|
|
558
|
+
# docker image is built locally then pushed
|
|
559
|
+
if args.build:
|
|
560
|
+
return build_and_push_docker_image(args)
|
|
561
|
+
|
|
562
|
+
docker_url = ""
|
|
563
|
+
if args.image:
|
|
564
|
+
docker_url = args.image
|
|
565
|
+
else:
|
|
566
|
+
try:
|
|
567
|
+
docker_url = instance_config.get_docker_url()
|
|
568
|
+
except NoDockerImageError:
|
|
569
|
+
print(
|
|
570
|
+
PaastaColors.red(
|
|
571
|
+
"Error: No sha has been marked for deployment for the %s deploy group.\n"
|
|
572
|
+
"Please ensure this service has either run through a jenkins pipeline "
|
|
573
|
+
"or paasta mark-for-deployment has been run for %s\n"
|
|
574
|
+
% (instance_config.get_deploy_group(), args.service)
|
|
575
|
+
),
|
|
576
|
+
sep="",
|
|
577
|
+
file=sys.stderr,
|
|
578
|
+
)
|
|
579
|
+
return None
|
|
580
|
+
|
|
581
|
+
print(
|
|
582
|
+
"Please wait while the image (%s) is pulled (times out after 5m)..."
|
|
583
|
+
% docker_url,
|
|
584
|
+
file=sys.stderr,
|
|
585
|
+
)
|
|
586
|
+
# Need sudo for credentials when pulling images from paasta docker registry (docker-paasta.yelpcorp.com)
|
|
587
|
+
# However, in CI env, we can't connect to docker via root and we can pull with user `jenkins`
|
|
588
|
+
is_ci_env = "CI" in os.environ
|
|
589
|
+
cmd_prefix = "" if is_ci_env else "sudo -H "
|
|
590
|
+
retcode, _ = _run(f"{cmd_prefix}docker pull {docker_url}", stream=True, timeout=300)
|
|
591
|
+
if retcode != 0:
|
|
592
|
+
print(
|
|
593
|
+
"\nPull failed. Are you authorized to run docker commands?",
|
|
594
|
+
file=sys.stderr,
|
|
595
|
+
)
|
|
596
|
+
return None
|
|
597
|
+
return docker_url
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def get_smart_paasta_instance_name(args):
|
|
601
|
+
if os.environ.get("TRON_JOB_NAMESPACE"):
|
|
602
|
+
tron_job = os.environ.get("TRON_JOB_NAME")
|
|
603
|
+
tron_action = os.environ.get("TRON_ACTION")
|
|
604
|
+
return f"{tron_job}.{tron_action}"
|
|
605
|
+
else:
|
|
606
|
+
how_submitted = None
|
|
607
|
+
if args.mrjob:
|
|
608
|
+
how_submitted = "mrjob"
|
|
609
|
+
else:
|
|
610
|
+
for spark_cmd in SPARK_COMMANDS:
|
|
611
|
+
if spark_cmd in args.cmd:
|
|
612
|
+
how_submitted = spark_cmd
|
|
613
|
+
break
|
|
614
|
+
how_submitted = how_submitted or "other"
|
|
615
|
+
return f"{args.instance}_{get_username()}_{how_submitted}"
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def get_spark_env(
|
|
619
|
+
args: argparse.Namespace,
|
|
620
|
+
spark_conf_str: str,
|
|
621
|
+
aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
|
|
622
|
+
ui_port: str,
|
|
623
|
+
system_paasta_config: SystemPaastaConfig,
|
|
624
|
+
) -> Dict[str, str]:
|
|
625
|
+
"""Create the env config dict to configure on the docker container"""
|
|
626
|
+
|
|
627
|
+
spark_env = {}
|
|
628
|
+
access_key, secret_key, session_token = aws_creds
|
|
629
|
+
if access_key:
|
|
630
|
+
spark_env["AWS_ACCESS_KEY_ID"] = access_key
|
|
631
|
+
spark_env["AWS_SECRET_ACCESS_KEY"] = secret_key
|
|
632
|
+
if session_token is not None:
|
|
633
|
+
spark_env["AWS_SESSION_TOKEN"] = session_token
|
|
634
|
+
|
|
635
|
+
spark_env["AWS_DEFAULT_REGION"] = args.aws_region
|
|
636
|
+
spark_env["PAASTA_LAUNCHED_BY"] = get_possible_launched_by_user_variable_from_env()
|
|
637
|
+
spark_env["PAASTA_INSTANCE_TYPE"] = "spark"
|
|
638
|
+
|
|
639
|
+
# Run spark (and mesos framework) as root.
|
|
640
|
+
spark_env["SPARK_USER"] = "root"
|
|
641
|
+
spark_env["SPARK_OPTS"] = spark_conf_str
|
|
642
|
+
|
|
643
|
+
# Default configs to start the jupyter notebook server
|
|
644
|
+
if args.cmd == "jupyter-lab":
|
|
645
|
+
spark_env["JUPYTER_RUNTIME_DIR"] = "/source/.jupyter"
|
|
646
|
+
spark_env["JUPYTER_DATA_DIR"] = "/source/.jupyter"
|
|
647
|
+
spark_env["JUPYTER_CONFIG_DIR"] = "/source/.jupyter"
|
|
648
|
+
elif args.cmd == "history-server":
|
|
649
|
+
dirs = args.work_dir.split(":")
|
|
650
|
+
spark_env["SPARK_LOG_DIR"] = dirs[1]
|
|
651
|
+
if not args.spark_args or not args.spark_args.startswith(
|
|
652
|
+
"spark.history.fs.logDirectory"
|
|
653
|
+
):
|
|
654
|
+
print(
|
|
655
|
+
"history-server requires spark.history.fs.logDirectory in spark-args",
|
|
656
|
+
file=sys.stderr,
|
|
657
|
+
)
|
|
658
|
+
sys.exit(1)
|
|
659
|
+
spark_env["SPARK_HISTORY_OPTS"] = (
|
|
660
|
+
f"-D{args.spark_args} " f"-Dspark.history.ui.port={ui_port}"
|
|
661
|
+
)
|
|
662
|
+
spark_env["SPARK_DAEMON_CLASSPATH"] = "/opt/spark/extra_jars/*"
|
|
663
|
+
spark_env["SPARK_NO_DAEMONIZE"] = "true"
|
|
664
|
+
|
|
665
|
+
if args.get_eks_token_via_iam_user:
|
|
666
|
+
with open(SPARK_DRIVER_IAM_USER) as f:
|
|
667
|
+
config = ConfigParser()
|
|
668
|
+
config.read_file(f)
|
|
669
|
+
|
|
670
|
+
# these env variables are consumed by a script specified in the spark kubeconfig - and which will result in a tightly-scoped IAM identity being used for EKS cluster access
|
|
671
|
+
spark_env["GET_EKS_TOKEN_AWS_ACCESS_KEY_ID"] = config["default"][
|
|
672
|
+
"aws_access_key_id"
|
|
673
|
+
]
|
|
674
|
+
spark_env["GET_EKS_TOKEN_AWS_SECRET_ACCESS_KEY"] = config["default"][
|
|
675
|
+
"aws_secret_access_key"
|
|
676
|
+
]
|
|
677
|
+
|
|
678
|
+
spark_env["KUBECONFIG"] = system_paasta_config.get_spark_iam_user_kubeconfig()
|
|
679
|
+
else:
|
|
680
|
+
spark_env["KUBECONFIG"] = system_paasta_config.get_spark_kubeconfig()
|
|
681
|
+
|
|
682
|
+
return spark_env
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def get_all_iam_roles_for_service(
|
|
686
|
+
service: str,
|
|
687
|
+
cluster: str,
|
|
688
|
+
) -> Set[str]:
|
|
689
|
+
tron_instance_configs = load_tron_instance_configs(service, cluster)
|
|
690
|
+
roles = set()
|
|
691
|
+
for action in tron_instance_configs:
|
|
692
|
+
role = action.get_iam_role()
|
|
693
|
+
if role:
|
|
694
|
+
roles.add(role)
|
|
695
|
+
return roles
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
def _parse_user_spark_args(
|
|
699
|
+
spark_args: str,
|
|
700
|
+
) -> Dict[str, str]:
|
|
701
|
+
|
|
702
|
+
user_spark_opts = {}
|
|
703
|
+
if spark_args:
|
|
704
|
+
for spark_arg in spark_args.split():
|
|
705
|
+
fields = spark_arg.split("=", 1)
|
|
706
|
+
if len(fields) != 2:
|
|
707
|
+
print(
|
|
708
|
+
PaastaColors.red(
|
|
709
|
+
"Spark option %s is not in format option=value." % spark_arg
|
|
710
|
+
),
|
|
711
|
+
file=sys.stderr,
|
|
712
|
+
)
|
|
713
|
+
sys.exit(1)
|
|
714
|
+
user_spark_opts[fields[0]] = fields[1]
|
|
715
|
+
|
|
716
|
+
return user_spark_opts
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def run_docker_container(
|
|
720
|
+
container_name,
|
|
721
|
+
volumes,
|
|
722
|
+
environment,
|
|
723
|
+
docker_img,
|
|
724
|
+
docker_cmd,
|
|
725
|
+
dry_run,
|
|
726
|
+
nvidia,
|
|
727
|
+
docker_memory_limit,
|
|
728
|
+
docker_shm_size,
|
|
729
|
+
docker_cpu_limit,
|
|
730
|
+
) -> int:
|
|
731
|
+
|
|
732
|
+
docker_run_args = dict(
|
|
733
|
+
container_name=container_name,
|
|
734
|
+
volumes=volumes,
|
|
735
|
+
env=environment,
|
|
736
|
+
docker_img=docker_img,
|
|
737
|
+
docker_cmd=docker_cmd,
|
|
738
|
+
nvidia=nvidia,
|
|
739
|
+
docker_memory_limit=docker_memory_limit,
|
|
740
|
+
docker_shm_size=docker_shm_size,
|
|
741
|
+
docker_cpu_limit=docker_cpu_limit,
|
|
742
|
+
)
|
|
743
|
+
docker_run_cmd = get_docker_run_cmd(**docker_run_args)
|
|
744
|
+
if dry_run:
|
|
745
|
+
print(json.dumps(docker_run_cmd))
|
|
746
|
+
return 0
|
|
747
|
+
|
|
748
|
+
merged_env = {**os.environ, **environment}
|
|
749
|
+
os.execlpe("paasta_docker_wrapper", *docker_run_cmd, merged_env)
|
|
750
|
+
return 0
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
def get_spark_app_name(original_docker_cmd: Union[Any, str, List[str]]) -> str:
|
|
754
|
+
"""Use submitted batch name as default spark_run job name"""
|
|
755
|
+
docker_cmds = (
|
|
756
|
+
shlex.split(original_docker_cmd)
|
|
757
|
+
if isinstance(original_docker_cmd, str)
|
|
758
|
+
else original_docker_cmd
|
|
759
|
+
)
|
|
760
|
+
spark_app_name = None
|
|
761
|
+
after_spark_submit = False
|
|
762
|
+
for arg in docker_cmds:
|
|
763
|
+
if arg == "spark-submit":
|
|
764
|
+
after_spark_submit = True
|
|
765
|
+
elif after_spark_submit and arg.endswith(".py"):
|
|
766
|
+
batch_name = arg.split("/")[-1].replace(".py", "")
|
|
767
|
+
spark_app_name = "paasta_" + batch_name
|
|
768
|
+
break
|
|
769
|
+
elif arg == "jupyter-lab":
|
|
770
|
+
spark_app_name = "paasta_jupyter"
|
|
771
|
+
break
|
|
772
|
+
|
|
773
|
+
if spark_app_name is None:
|
|
774
|
+
spark_app_name = "paasta_spark_run"
|
|
775
|
+
|
|
776
|
+
spark_app_name += f"_{get_username()}"
|
|
777
|
+
|
|
778
|
+
return spark_app_name
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def _calculate_docker_memory_limit(
|
|
782
|
+
spark_conf: Mapping[str, str], memory_limit: Optional[str]
|
|
783
|
+
) -> str:
|
|
784
|
+
"""In Order of preference:
|
|
785
|
+
1. Argument: --docker-memory-limit
|
|
786
|
+
2. --spark-args or spark-submit: spark.driver.memory
|
|
787
|
+
3. Default
|
|
788
|
+
"""
|
|
789
|
+
if memory_limit:
|
|
790
|
+
return memory_limit
|
|
791
|
+
|
|
792
|
+
try:
|
|
793
|
+
docker_memory_limit_str = spark_conf.get(
|
|
794
|
+
"spark.driver.memory", DEFAULT_DRIVER_MEMORY_BY_SPARK
|
|
795
|
+
)
|
|
796
|
+
adjustment_factor = DOCKER_RESOURCE_ADJUSTMENT_FACTOR
|
|
797
|
+
match = re.match(r"([0-9]+)([a-z]*)", docker_memory_limit_str)
|
|
798
|
+
memory_val = int(match[1]) * adjustment_factor
|
|
799
|
+
memory_unit = match[2]
|
|
800
|
+
docker_memory_limit = f"{memory_val}{memory_unit}"
|
|
801
|
+
except Exception as e:
|
|
802
|
+
# For any reason it fails, continue with default value
|
|
803
|
+
print(
|
|
804
|
+
f"ERROR: Failed to parse docker memory limit. Error: {e}. Example values: 1g, 200m."
|
|
805
|
+
)
|
|
806
|
+
raise
|
|
807
|
+
|
|
808
|
+
return docker_memory_limit
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
def _calculate_docker_shared_memory_size(shm_size: Optional[str]) -> str:
|
|
812
|
+
"""In Order of preference:
|
|
813
|
+
1. Argument: --docker-shm-size
|
|
814
|
+
3. Default
|
|
815
|
+
"""
|
|
816
|
+
if shm_size:
|
|
817
|
+
return shm_size
|
|
818
|
+
|
|
819
|
+
return DEFAULT_DOCKER_SHM_SIZE
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
def _calculate_docker_cpu_limit(
|
|
823
|
+
spark_conf: Mapping[str, str], cpu_limit: Optional[str]
|
|
824
|
+
) -> str:
|
|
825
|
+
"""In Order of preference:
|
|
826
|
+
1. Argument: --docker-cpu-limit
|
|
827
|
+
2. --spark-args or spark-submit: spark.driver.cores
|
|
828
|
+
3. Default
|
|
829
|
+
"""
|
|
830
|
+
return (
|
|
831
|
+
cpu_limit
|
|
832
|
+
if cpu_limit
|
|
833
|
+
else spark_conf.get("spark.driver.cores", str(DEFAULT_DRIVER_CORES_BY_SPARK))
|
|
834
|
+
)
|
|
835
|
+
|
+
+def configure_and_run_docker_container(
+    args: argparse.Namespace,
+    docker_img: str,
+    instance_config: InstanceConfig,
+    system_paasta_config: SystemPaastaConfig,
+    spark_conf: Dict[str, str],
+    aws_creds: Tuple[Optional[str], Optional[str], Optional[str]],
+    cluster_manager: str,
+    pod_template_path: str,
+    extra_driver_envs: Dict[str, str] = dict(),
+) -> int:
+    docker_memory_limit = _calculate_docker_memory_limit(
+        spark_conf, args.docker_memory_limit
+    )
+    docker_shm_size = _calculate_docker_shared_memory_size(args.docker_shm_size)
+    docker_cpu_limit = _calculate_docker_cpu_limit(
+        spark_conf,
+        args.docker_cpu_limit,
+    )
+
+    if cluster_manager in {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL}:
+        # service_configuration_lib puts volumes into the k8s
+        # configs for local mode
+        volumes = get_volumes_from_spark_k8s_configs(spark_conf)
+    else:
+        raise UnsupportedClusterManagerException(cluster_manager)
+
+    volumes.append("%s:rw" % args.work_dir)
+    volumes.append("/nail/home:/nail/home:rw")
+
+    if pod_template_path:
+        volumes.append(f"{pod_template_path}:{pod_template_path}:rw")
+
+    # NOTE: we mount a directory here since the kubeconfig we're transitioning to requires a helper script that will co-exist in the same directory
+    kubeconfig_dir = os.path.dirname(system_paasta_config.get_spark_kubeconfig())
+    volumes.append(f"{kubeconfig_dir}:{kubeconfig_dir}:ro")
+
+    environment = instance_config.get_env_dictionary()  # type: ignore
+    spark_conf_str = create_spark_config_str(spark_conf, is_mrjob=args.mrjob)
+    environment.update(
+        get_spark_env(
+            args=args,
+            spark_conf_str=spark_conf_str,
+            aws_creds=aws_creds,
+            ui_port=spark_conf["spark.ui.port"],
+            system_paasta_config=system_paasta_config,
+        )
+    )  # type:ignore
+    environment.update(extra_driver_envs)
+
+    if args.use_service_auth_token:
+        environment["YELP_SVC_AUTHZ_TOKEN"] = get_service_auth_token()
+
+    webui_url = get_webui_url(spark_conf["spark.ui.port"])
+    webui_url_msg = PaastaColors.green(f"\nSpark monitoring URL: ") + f"{webui_url}\n"
+
+    docker_cmd = get_docker_cmd(args, instance_config, spark_conf_str)
+    if "history-server" in docker_cmd:
+        print(PaastaColors.green(f"\nSpark history server URL: ") + f"{webui_url}\n")
+    elif any(c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]):
+        grafana_url = get_grafana_url(spark_conf)
+        dashboard_url_msg = (
+            PaastaColors.green(f"\nGrafana dashboard: ") + f"{grafana_url}\n"
+        )
+        print(webui_url_msg)
+        print(dashboard_url_msg)
+        log.info(webui_url_msg)
+        log.info(dashboard_url_msg)
+        spark_conf_builder = spark_config.SparkConfBuilder()
+        history_server_url = spark_conf_builder.get_history_url(spark_conf)
+        if history_server_url:
+            history_server_url_msg = (
+                f"\nAfter the job is finished, you can find the spark UI from {history_server_url}\n"
+                "Check y/spark-recent-history for faster access to prod logs\n"
+            )
+            print(history_server_url_msg)
+            log.info(history_server_url_msg)
+    print(f"Selected cluster manager: {cluster_manager}\n")
+
+    if clusterman_metrics and _should_get_resource_requirements(docker_cmd, args.mrjob):
+        resources = get_resources_requested(spark_conf)
+        hourly_cost = get_spark_hourly_cost(
+            clusterman_metrics,
+            resources,
+            spark_conf["spark.executorEnv.PAASTA_CLUSTER"],
+            args.pool,
+        )
+        message = (
+            f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)"
+            f" is estimated to cost ${hourly_cost} per hour"
+        )
+        if clusterman_metrics.util.costs.should_warn(hourly_cost):
+            print(PaastaColors.red(f"WARNING: {message}"))
+        else:
+            print(message)
+
+    return run_docker_container(
+        container_name=spark_conf["spark.app.name"],
+        volumes=volumes,
+        environment=environment,
+        docker_img=docker_img,
+        docker_cmd=docker_cmd,
+        dry_run=args.dry_run,
+        nvidia=args.nvidia,
+        docker_memory_limit=docker_memory_limit,
+        docker_shm_size=docker_shm_size,
+        docker_cpu_limit=docker_cpu_limit,
+    )
+
+
+def _should_get_resource_requirements(docker_cmd: str, is_mrjob: bool) -> bool:
+    return is_mrjob or any(
+        c in docker_cmd for c in ["pyspark", "spark-shell", "spark-submit"]
+    )
+
+
+def get_docker_cmd(
+    args: argparse.Namespace, instance_config: InstanceConfig, spark_conf_str: str
+) -> str:
+    original_docker_cmd = str(args.cmd or instance_config.get_cmd())
+
+    if args.mrjob:
+        return original_docker_cmd + " " + spark_conf_str
+    # Default cli options to start the jupyter notebook server.
+    elif original_docker_cmd == "jupyter-lab":
+        cull_opts = (
+            "--MappingKernelManager.cull_idle_timeout=%s " % args.cull_idle_timeout
+        )
+        if args.not_cull_connected is False:
+            cull_opts += "--MappingKernelManager.cull_connected=True "
+
+        return "SHELL=bash USER={} /source/virtualenv_run_jupyter/bin/jupyter-lab -y --ip={} {}".format(
+            get_username(), socket.getfqdn(), cull_opts
+        )
+    elif original_docker_cmd == "history-server":
+        return "start-history-server.sh"
+    # Spark options are passed as options to pyspark and spark-shell.
+    # For jupyter, environment variable SPARK_OPTS is set instead.
+    else:
+        return inject_spark_conf_str(original_docker_cmd, spark_conf_str)
+
+
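As a quick illustration of the jupyter-lab branch above, here is a minimal self-contained sketch (not part of the diff) of the command string it assembles. The username, hostname, and timeout values are made-up stand-ins for what the real code pulls from the CLI args, get_username(), and socket.getfqdn().

# Illustrative stand-ins only; the real values come from argparse and the host.
cull_idle_timeout = 7200
not_cull_connected = False
username = "someuser"
fqdn = "dev123.example.com"

# Mirrors the string assembly in get_docker_cmd's jupyter-lab branch above.
cull_opts = "--MappingKernelManager.cull_idle_timeout=%s " % cull_idle_timeout
if not_cull_connected is False:
    cull_opts += "--MappingKernelManager.cull_connected=True "

docker_cmd = "SHELL=bash USER={} /source/virtualenv_run_jupyter/bin/jupyter-lab -y --ip={} {}".format(
    username, fqdn, cull_opts
)
print(docker_cmd)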
+def _get_adhoc_docker_registry(service: str, soa_dir: str = DEFAULT_SOA_DIR) -> str:
+    if service is None:
+        raise NotImplementedError('"None" is not a valid service')
+
+    service_configuration = read_service_configuration(service, soa_dir)
+    return service_configuration.get("docker_registry", DEFAULT_SPARK_DOCKER_REGISTRY)
+
+
+def build_and_push_docker_image(args: argparse.Namespace) -> Optional[str]:
+    """
+    Build an image if the default Spark service image is not preferred.
+    The image needs to be pushed to a registry for the Spark executors
+    to pull.
+    """
+    if not makefile_responds_to("cook-image"):
+        print(
+            "A local Makefile with a 'cook-image' target is required for --build",
+            file=sys.stderr,
+        )
+        return None
+
+    default_tag = "{}-{}".format(DEFAULT_SPARK_DOCKER_IMAGE_PREFIX, get_username())
+    docker_tag = os.environ.get("DOCKER_TAG", default_tag)
+    os.environ["DOCKER_TAG"] = docker_tag
+
+    cook_return = paasta_cook_image(
+        args=None, service=args.service, soa_dir=args.yelpsoa_config_root
+    )
+    if cook_return != 0:
+        return None
+
+    registry_uri = args.docker_registry or _get_adhoc_docker_registry(
+        service=args.service,
+        soa_dir=args.yelpsoa_config_root,
+    )
+
+    docker_url = f"{registry_uri}/{docker_tag}"
+    command = f"docker tag {docker_tag} {docker_url}"
+    print(PaastaColors.grey(command))
+    retcode, _ = _run(command, stream=True)
+    if retcode != 0:
+        return None
+
+    if registry_uri != DEFAULT_SPARK_DOCKER_REGISTRY:
+        command = "sudo -H docker push %s" % docker_url
+    else:
+        command = "docker push %s" % docker_url
+
+    print(PaastaColors.grey(command))
+    retcode, output = _run(command, stream=False)
+    if retcode != 0:
+        return None
+
+    # With unprivileged docker, the digest on the remote registry may not match the digest
+    # in the local environment. Because of this, we have to parse the digest message from the
+    # server response and use it downstream when launching spark executors.
+
+    # Output from `docker push` with unprivileged docker looks like
+    # Using default tag: latest
+    # The push refers to repository [docker-dev.yelpcorp.com/paasta-spark-run-dpopes:latest]
+    # latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557
+
+    # With privileged docker, the last line has an extra "size: 123"
+    # latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557 size: 52
+
+    digest_line = output.split("\n")[-1]
+    digest_match = re.match(r"[^:]*: [^:]*: (?P<digest>[^\s]*)", digest_line)
+    if not digest_match:
+        raise ValueError(f"Could not determine digest from output: {output}")
+    digest = digest_match.group("digest")
+
+    image_url = f"{docker_url}@{digest}"
+
+    # If the local digest doesn't match the remote digest AND the registry is
+    # non-default (which requires authentication, and consequently sudo),
+    # downstream `docker run` commands will fail trying to authenticate.
+    # To work around this, we can proactively `sudo docker pull` here so that
+    # the image exists locally and can be `docker run` without sudo.
+    if registry_uri != DEFAULT_SPARK_DOCKER_REGISTRY:
+        command = f"sudo -H docker pull {image_url}"
+        print(PaastaColors.grey(command))
+        retcode, output = _run(command, stream=False)
+        if retcode != 0:
+            raise NoDockerImageError(f"Could not pull {image_url}: {output}")
+
+    return image_url
+
+
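For reference, a small self-contained sketch (not part of the diff) of how the digest-parsing regex in build_and_push_docker_image above behaves. The sample lines are taken verbatim from the `docker push` output quoted in its comments.

import re

# Last line of `docker push` output in the two shapes described above:
unprivileged = "latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557"
privileged = unprivileged + " size: 52"

pattern = r"[^:]*: [^:]*: (?P<digest>[^\s]*)"  # same regex as in build_and_push_docker_image
for line in (unprivileged, privileged):
    match = re.match(pattern, line)
    assert match is not None
    # Both shapes yield the bare digest, e.g. "sha256:0a43aa65...";
    # the trailing " size: 52" is dropped because the capture group stops at whitespace.
    print(match.group("digest"))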
+def validate_work_dir(s):
+    dirs = s.split(":")
+    if len(dirs) != 2:
+        print(
+            "work-dir %s is not in format local_abs_dir:container_abs_dir" % s,
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    for d in dirs:
+        if not os.path.isabs(d):
+            print("%s is not an absolute path" % d, file=sys.stderr)
+            sys.exit(1)
+
+
+def parse_tronfig(tronfig_path: str, job_id: str) -> Optional[Dict[str, Any]]:
+    splitted = job_id.split(".")
+    if len(splitted) != 2:
+        return None
+    job_name, action_name = splitted
+
+    file_content = read_yaml_file(tronfig_path)
+    jobs = filter_templates_from_config(file_content)
+    if job_name not in jobs or action_name not in jobs[job_name].get("actions", {}):
+        return None
+    return jobs[job_name]["actions"][action_name]
+
+
+def update_args_from_tronfig(args: argparse.Namespace) -> Optional[Dict[str, str]]:
+    """
+    Load and check the following config fields from the provided Tronfig.
+      - executor
+      - pool
+      - iam_role
+      - iam_role_provider
+      - force_spark_resource_configs
+      - max_runtime
+      - command
+      - env
+      - spark_args
+
+    Returns: environment variables dictionary or None if failed.
+    """
+    action_dict = parse_tronfig(args.tronfig, args.job_id)
+    if action_dict is None:
+        print(
+            PaastaColors.red(f"Unable to get configs from job-id: {args.job_id}"),
+            file=sys.stderr,
+        )
+        return None
+
+    # executor == spark
+    if action_dict.get("executor", "") != "spark":
+        print(
+            PaastaColors.red("Invalid Tronfig: executor should be 'spark'"),
+            file=sys.stderr,
+        )
+        return None
+
+    # iam_role / aws_profile
+    if (
+        "iam_role" in action_dict
+        and action_dict.get("iam_role_provider", "aws") != "aws"
+    ):
+        print(
+            PaastaColors.red("Invalid Tronfig: iam_role_provider should be 'aws'"),
+            file=sys.stderr,
+        )
+        return None
+
+    # Other args: map Tronfig YAML fields to spark-run CLI args
+    fields_to_args = {
+        "pool": "pool",
+        "iam_role": "assume_aws_role",
+        "force_spark_resource_configs": "force_spark_resource_configs",
+        "max_runtime": "timeout_job_runtime",
+        "command": "cmd",
+        "spark_args": "spark_args",
+    }
+    for field_name, arg_name in fields_to_args.items():
+        if field_name in action_dict:
+            value = action_dict[field_name]
+
+            # Convert spark_args values from dict to a string "k1=v1 k2=v2"
+            if field_name == "spark_args":
+                value = " ".join([f"{k}={v}" for k, v in dict(value).items()])
+
+            # Beautify for printing
+            arg_name_str = (f"--{arg_name.replace('_', '-')}").ljust(31, " ")
+
+            # Only load iam_role value if --aws-profile is not set
+            if field_name == "iam_role" and args.aws_profile is not None:
+                print(
+                    PaastaColors.yellow(
+                        f"Ignoring Tronfig: `{field_name} : {value}`, since `--aws-profile` is provided. "
+                        f"We are giving higher priority to `--aws-profile` in case of paasta spark-run adhoc runs."
+                    ),
+                )
+                continue
+
+            if hasattr(args, arg_name):
+                print(
+                    PaastaColors.yellow(
+                        f"Overwriting args with Tronfig: {arg_name_str} => {field_name} : {value}"
+                    ),
+                )
+                setattr(args, arg_name, value)
+
+    # env (currently paasta spark-run does not support Spark driver secrets environment variables)
+    return action_dict.get("env", dict())
+
+
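To make the job-id lookup in parse_tronfig and the field mapping in update_args_from_tronfig above concrete, here is a hypothetical sketch (not part of the diff) of the parsed structure a Tronfig might yield. The job and action names and all values are invented; read_yaml_file and filter_templates_from_config come from the package and are replaced here by a plain dict.

# Hypothetical result of read_yaml_file(...) followed by filter_templates_from_config(...):
jobs = {
    "nightly_batch": {
        "actions": {
            "compute": {
                "executor": "spark",  # required by update_args_from_tronfig
                "pool": "batch",  # mapped to --pool
                "iam_role": "arn:aws:iam::000000000000:role/example",  # mapped to --assume-aws-role
                "max_runtime": "4h",  # mapped to --timeout-job-runtime
                "command": "spark-submit job.py",  # mapped to --cmd
                "spark_args": {"spark.executor.cores": 2},  # flattened to "k=v" pairs
                "env": {"SOME_ENV": "value"},  # returned as extra driver env vars
            }
        }
    }
}

# parse_tronfig("<path>", "nightly_batch.compute") effectively resolves:
job_name, action_name = "nightly_batch.compute".split(".")
action_dict = jobs[job_name]["actions"][action_name]

# update_args_from_tronfig then flattens spark_args and returns the env dict:
spark_args_str = " ".join(f"{k}={v}" for k, v in action_dict["spark_args"].items())
print(spark_args_str)  # spark.executor.cores=2
print(action_dict["env"])  # {'SOME_ENV': 'value'}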
+def paasta_spark_run(args: argparse.Namespace) -> int:
+    if args.get_eks_token_via_iam_user and os.getuid() != 0:
+        print("Re-executing paasta spark-run with sudo..", file=sys.stderr)
+        # argv[0] is treated as command name, so prepending "sudo"
+        os.execvp("sudo", ["sudo"] + sys.argv)
+        return  # will not reach unless above function is mocked
+
+    driver_envs_from_tronfig: Dict[str, str] = dict()
+    if args.tronfig is not None:
+        if args.job_id is None:
+            print(
+                PaastaColors.red("Missing --job-id when --tronfig is provided"),
+                file=sys.stderr,
+            )
+            return False
+        driver_envs_from_tronfig = update_args_from_tronfig(args)
+        if driver_envs_from_tronfig is None:
+            return False
+
+    # argparse does not work as expected with both default and
+    # type=validate_work_dir.
+    validate_work_dir(args.work_dir)
+
+    try:
+        system_paasta_config = load_system_paasta_config()
+    except PaastaNotConfiguredError:
+        print(
+            PaastaColors.yellow(
+                "Warning: Couldn't load config files from '/etc/paasta'. This indicates"
+                "PaaSTA is not configured locally on this host, and local-run may not behave"
+                "the same way it would behave on a server configured for PaaSTA."
+            ),
+            sep="\n",
+        )
+        system_paasta_config = SystemPaastaConfig({"volumes": []}, "/etc/paasta")
+
+    if args.cmd == "jupyter-lab" and not args.build and not args.image:
+        print(
+            PaastaColors.red(
+                "The jupyter-lab command requires a prebuilt image with -I or --image."
+            ),
+            file=sys.stderr,
+        )
+        return 1
+
+    # validate pool
+    try:
+        if not validate_pool(args.cluster, args.pool, system_paasta_config):
+            print(
+                PaastaColors.red(
+                    f"Invalid --pool value. List of valid pools for cluster `{args.cluster}`: "
+                    f"{system_paasta_config.get_pools_for_cluster(args.cluster)}"
+                ),
+                file=sys.stderr,
+            )
+            return 1
+    except PoolsNotConfiguredError:
+        log.warning(
+            PaastaColors.yellow(
+                f"Could not fetch allowed_pools for `{args.cluster}`. Skipping pool validation.\n"
+            )
+        )
+
+    # annoyingly, there's two layers of aliases: one for the soaconfigs to read from
+    # (that's this alias lookup) - and then another layer later when figuring out what
+    # k8s server url to use ;_;
+    cluster = system_paasta_config.get_cluster_aliases().get(args.cluster, args.cluster)
+    # Use the default spark:client instance configs if not provided
+    try:
+        instance_config = get_instance_config(
+            service=args.service,
+            instance=args.instance,
+            cluster=cluster,
+            load_deployments=args.build is False and args.image is None,
+            soa_dir=args.yelpsoa_config_root,
+        )
+        # If the spark job has uses_bulkdata set then propagate it to the instance_config
+        # If not, then whatever the instance_config has will be used
+        if args.uses_bulkdata:
+            instance_config.config_dict["uses_bulkdata"] = args.uses_bulkdata
+    except NoConfigurationForServiceError as e:
+        print(str(e), file=sys.stderr)
+        return 1
+    except NoDeploymentsAvailable:
+        print(
+            PaastaColors.red(
+                "Error: No deployments.json found in %(soa_dir)s/%(service)s."
+                "You can generate this by running:"
+                "generate_deployments_for_service -d %(soa_dir)s -s %(service)s"
+                % {"soa_dir": args.yelpsoa_config_root, "service": args.service}
+            ),
+            sep="\n",
+            file=sys.stderr,
+        )
+        return 1
+
+    if not args.cmd and not instance_config.get_cmd():
+        print(
+            "A command is required, pyspark, spark-shell, spark-submit or jupyter",
+            file=sys.stderr,
+        )
+        return 1
+
+    service_account_name = None
+    iam_role = instance_config.get_iam_role()
+    if args.executor_pod_identity and not (iam_role or args.force_pod_identity):
+        print(
+            "--executor-pod-identity set but no iam_role settings found.",
+            file=sys.stderr,
+        )
+        return 1
+    if args.executor_pod_identity:
+        if args.force_pod_identity:
+            if args.yelpsoa_config_root != DEFAULT_SOA_DIR:
+                print(
+                    "--force-pod-identity cannot be used with --yelpsoa-config-root",
+                    file=sys.stderr,
+                )
+                return 1
+
+            allowed_iam_roles = get_all_iam_roles_for_service(
+                args.service, args.cluster
+            )
+            if args.force_pod_identity not in allowed_iam_roles:
+                print(
+                    f"{args.force_pod_identity} is not an allowed role for this service. "
+                    f"Allowed roles are: {allowed_iam_roles}.",
+                    file=sys.stderr,
+                )
+                return 1
+
+            service_account_name = get_service_account_name(args.force_pod_identity)
+        else:
+            service_account_name = get_service_account_name(iam_role)
+        if (
+            not args.aws_credentials_yaml
+            and not args.aws_profile
+            and not args.assume_aws_role
+        ):
+            args.aws_credentials_yaml = (
+                system_paasta_config.get_default_spark_iam_user()
+            )
+        log.info(f"Running executor with service account {service_account_name}")
+
+    aws_creds = get_aws_credentials(
+        service=args.service,
+        aws_credentials_yaml=args.aws_credentials_yaml,
+        profile_name=args.aws_profile,
+        assume_aws_role_arn=args.assume_aws_role,
+        session_duration=args.aws_role_duration,
+        use_web_identity=args.use_web_identity,
+    )
+
+    # If executor pods use a service account, they don't need static aws creds
+    # but the driver still does
+    if service_account_name:
+        executor_aws_creds = None
+    else:
+        executor_aws_creds = aws_creds
+
+    docker_image_digest = get_docker_image(args, instance_config)
+    if docker_image_digest is None:
+        return 1
+
+    volumes = instance_config.get_volumes(
+        system_paasta_config.get_volumes(),
+    )
+    app_base_name = get_spark_app_name(args.cmd or instance_config.get_cmd())
+
+    user_spark_opts = _parse_user_spark_args(args.spark_args)
+
+    args.cmd = auto_add_timeout_for_spark_job(args.cmd, args.timeout_job_runtime)
+
+    # This is required if configs are provided as part of `spark-submit`
+    # Other way to provide is with --spark-args
+    sub_cmds = args.cmd.split(" ")  # spark.driver.memory=10g
+    for cmd in sub_cmds:
+        if cmd.startswith("spark.driver.memory") or cmd.startswith(
+            "spark.driver.cores"
+        ):
+            key, value = cmd.split("=")
+            user_spark_opts[key] = value
+
+    paasta_instance = get_smart_paasta_instance_name(args)
+
+    k8s_server_address = get_k8s_url_for_cluster(args.cluster)
+    paasta_cluster = system_paasta_config.get_eks_cluster_aliases().get(
+        args.cluster, args.cluster
+    )
+
+    spark_conf_builder = spark_config.SparkConfBuilder()
+    spark_conf = spark_conf_builder.get_spark_conf(
+        cluster_manager=args.cluster_manager,
+        spark_app_base_name=app_base_name,
+        docker_img=docker_image_digest,
+        user_spark_opts=user_spark_opts,
+        paasta_cluster=paasta_cluster,
+        paasta_pool=args.pool,
+        paasta_service=args.service,
+        paasta_instance=paasta_instance,
+        extra_volumes=cast(List[Mapping[str, str]], volumes),
+        aws_creds=executor_aws_creds,
+        aws_region=args.aws_region,
+        force_spark_resource_configs=args.force_spark_resource_configs,
+        use_eks=True,
+        k8s_server_address=k8s_server_address,
+        service_account_name=service_account_name,
+    )
+
+    return configure_and_run_docker_container(
+        args,
+        docker_img=docker_image_digest,
+        instance_config=instance_config,
+        system_paasta_config=system_paasta_config,
+        spark_conf=spark_conf,
+        aws_creds=aws_creds,
+        cluster_manager=args.cluster_manager,
+        pod_template_path=spark_conf.get(
+            "spark.kubernetes.executor.podTemplateFile", ""
+        ),
+        extra_driver_envs=driver_envs_from_tronfig,
+    )
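Finally, a small standalone sketch (not part of the diff) of the spark.driver.memory / spark.driver.cores override loop inside paasta_spark_run above, showing how driver settings embedded in a spark-submit command end up in user_spark_opts. The command string is an invented example; in the real code it comes from args.cmd.

# Invented example command; only driver memory/cores tokens are picked up by the scan.
cmd = "spark-submit --conf spark.driver.memory=10g --conf spark.driver.cores=2 job.py"
user_spark_opts = {}

# Same token scan as in paasta_spark_run above.
for token in cmd.split(" "):
    if token.startswith("spark.driver.memory") or token.startswith("spark.driver.cores"):
        key, value = token.split("=")
        user_spark_opts[key] = value

print(user_spark_opts)  # {'spark.driver.memory': '10g', 'spark.driver.cores': '2'}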