skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -2,14 +2,15 @@
 import copy
 import dataclasses
 import enum
-import inspect
 import json
 import math
 import os
 import pathlib
+import random
 import re
 import shlex
 import signal
+import socket
 import subprocess
 import sys
 import tempfile
@@ -17,8 +18,8 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)

 import colorama
 import psutil
@@ -39,6 +40,7 @@ from sky import skypilot_config
 from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import task_codegen
 from sky.backends import wheel_utils
 from sky.clouds import cloud as sky_cloud
 from sky.clouds.utils import gcp_utils
@@ -48,14 +50,15 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.serve import constants as serve_constants
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
-from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import cluster_utils
 from sky.utils import command_runner
@@ -85,13 +88,34 @@ if typing.TYPE_CHECKING:
     from sky import dag
     from sky.schemas.generated import autostopv1_pb2
     from sky.schemas.generated import autostopv1_pb2_grpc
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
+    from sky.schemas.generated import servev1_pb2
+    from sky.schemas.generated import servev1_pb2_grpc
 else:
     # To avoid requiring grpcio to be installed on the client side.
-    grpc = adaptors_common.LazyImport(
+    grpc = adaptors_common.LazyImport(
+        'grpc',
+        # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+        if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
     autostopv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2')
     autostopv1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2_grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.jobsv1_pb2_grpc')
+    servev1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.servev1_pb2')
+    servev1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')

 Path = str

@@ -113,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
     clouds.OCI: 300,
     clouds.Paperspace: 600,
     clouds.Kubernetes: 300,
+    clouds.Shadeform: 300,
     clouds.Vsphere: 240,
 }

@@ -167,18 +192,12 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
     pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')

-
-
-
-
-
-
-# If the command is too long, we instead write it to a file, rsync and execute
-# it.
-#
-# We use 100KB as a threshold to be safe for other arguments that
-# might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+_EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
+    ('too long', 255),
+    ('request-uri too large', 1),
+    ('request header fields too large', 1),
+    ('400 bad request', 1),  # CloudFlare 400 error
+]

 _RESOURCES_UNAVAILABLE_LOG = (
     'Reasons for provision failures (for details, please check the log above):')
@@ -187,16 +206,59 @@ _RESOURCES_UNAVAILABLE_LOG = (
 _CLUSTER_LOCK_TIMEOUT = 5.0


-def
-
+def _is_message_too_long(returncode: int,
+                         output: Optional[str] = None,
+                         file_path: Optional[str] = None) -> bool:
+    """Check if the message sent to the remote is too long.

-    We
-
-
+    We use inline script to run the setup or run command, i.e. the script will
+    be part of the message sent to the remote cluster. There is a chance that
+    the command is too long, when people has very long run or setup commands, or
+    there is a cloudflare proxy in front of the remote blocking the long
+    message. Several common causes are:
+    - SSH returning: `too long` in the error message.
+    - Cloudflare proxy returning: `414 Request-URI Too Large` or
+      `431 Request Header Fields Too Large` error.
+
+    We use a general length limit check before but it could be inaccurate on
+    some systems, e.g. cloudflare proxy, so this is necessary.
+
+    Args:
+        returncode: The return code of the setup command.
+        output: The output of the setup command.
+        file_path: The path to the setup log file.
     """
+    assert (output is None) != (file_path is None), (
+        'Either output or file_path must be provided.', output, file_path)
+    to_check = []
+    for (match_str,
+         desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
+        if desired_rc == returncode:
+            to_check.append(match_str)
+    if not to_check:
+        return False
+
+    def _check_output_for_match_str(output: str) -> bool:
+        for match_str in to_check:
+            if match_str.lower() in output.lower():
+                return True
+        return False

-
-
+    if file_path is not None:
+        try:
+            with open(os.path.expanduser(file_path), 'r',
+                      encoding='utf-8') as f:
+                content = f.read()
+            return _check_output_for_match_str(content)
+        except Exception as e:  # pylint: disable=broad-except
+            # We don't crash the setup if we cannot read the log file.
+            # Instead, we should retry the setup with dumping the script
+            # to a file to be safe.
+            logger.debug(f'Failed to read setup log file {file_path}: {e}')
+            return True
+    else:
+        assert output is not None, (output, file_path)
+        return _check_output_for_match_str(output)


 def _get_cluster_config_template(cloud):
@@ -208,17 +270,21 @@ def _get_cluster_config_template(cloud):
         clouds.Lambda: 'lambda-ray.yml.j2',
         clouds.IBM: 'ibm-ray.yml.j2',
         clouds.SCP: 'scp-ray.yml.j2',
+        clouds.Slurm: 'slurm-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
         clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.SSH: 'kubernetes-ray.yml.j2',
+        clouds.Shadeform: 'shadeform-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
         clouds.Vast: 'vast-ray.yml.j2',
         clouds.Fluidstack: 'fluidstack-ray.yml.j2',
         clouds.Nebius: 'nebius-ray.yml.j2',
-        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+        clouds.Seeweb: 'seeweb-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]

@@ -248,511 +314,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
     return f.name


-class RayCodeGen:
-    """Code generator of a Ray program that executes a sky.Task.
-
-    Usage:
-
-      >> codegen = RayCodegen()
-      >> codegen.add_prologue()
-
-      >> codegen.add_ray_task(...)
-      >> codegen.add_ray_task(...)
-
-      >> codegen.add_epilogue()
-      >> code = codegen.build()
-    """
-
-    def __init__(self):
-        # Code generated so far, to be joined via '\n'.
-        self._code = []
-        # Guard method calling order.
-        self._has_prologue = False
-        self._has_epilogue = False
-
-        # For n nodes gang scheduling.
-        self._has_gang_scheduling = False
-        self._num_nodes = 0
-
-        self._has_register_run_fn = False
-
-        # job_id
-        # Job ID is used to identify the job (also this generated code).
-        # It is a int automatically generated by the DB on the cluster
-        # and monotonically increasing starting from 1.
-        # To generate the job ID, we use the following logic:
-        #   code = job_lib.JobLibCodeGen.add_job(username,
-        #                                        run_timestamp)
-        #   job_id = get_output(run_on_cluster(code))
-        self.job_id = None
-
-    def add_prologue(self, job_id: int) -> None:
-        assert not self._has_prologue, 'add_prologue() called twice?'
-        self._has_prologue = True
-        self.job_id = job_id
-        # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
-        # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
-        # Otherwise, ray will fail to get the placement group because of a bug
-        # in ray job.
-        ray_address = 'auto'
-        self._code = [
-            textwrap.dedent(f"""\
-                import functools
-                import getpass
-                import hashlib
-                import io
-                import os
-                import pathlib
-                import selectors
-                import shlex
-                import subprocess
-                import sys
-                import tempfile
-                import textwrap
-                import time
-                from typing import Dict, List, Optional, Tuple, Union
-
-                # Set the environment variables to avoid deduplicating logs and
-                # scheduler events. This should be set in driver code, since we are
-                # not using `ray job submit` anymore, and the environment variables
-                # from the ray cluster is not inherited.
-                os.environ['RAY_DEDUP_LOGS'] = '0'
-                os.environ['RAY_SCHEDULER_EVENTS'] = '0'
-
-                import ray
-                import ray.util as ray_util
-
-                from sky.skylet import autostop_lib
-                from sky.skylet import constants
-                from sky.skylet import job_lib
-                from sky.utils import log_utils
-                from sky.utils import subprocess_utils
-
-                SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
-
-                kwargs = dict()
-                # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
-                # the directory exists for backward compatibility for the VM
-                # launched before #1790.
-                if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
-                    kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
-                ray.init(
-                    address={ray_address!r},
-                    namespace='__sky__{job_id}__',
-                    log_to_driver=True,
-                    **kwargs
-                )
-                def get_or_fail(futures, pg) -> List[int]:
-                    \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
-                    if not futures:
-                        return []
-                    returncodes = [1] * len(futures)
-                    # Wait for 1 task to be ready.
-                    ready = []
-                    # Keep invoking ray.wait if ready is empty. This is because
-                    # ray.wait with timeout=None will only wait for 10**6 seconds,
-                    # which will cause tasks running for more than 12 days to return
-                    # before becoming ready.
-                    # (Such tasks are common in serving jobs.)
-                    # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
-                    while not ready:
-                        ready, unready = ray.wait(futures)
-                    idx = futures.index(ready[0])
-                    returncodes[idx] = ray.get(ready[0])
-                    while unready:
-                        if returncodes[idx] != 0:
-                            for task in unready:
-                                # ray.cancel without force fails to kill tasks.
-                                # We use force=True to kill unready tasks.
-                                ray.cancel(task, force=True)
-                                # Use SIGKILL=128+9 to indicate the task is forcely
-                                # killed.
-                                idx = futures.index(task)
-                                returncodes[idx] = 137
-                            break
-                        ready, unready = ray.wait(unready)
-                        idx = futures.index(ready[0])
-                        returncodes[idx] = ray.get(ready[0])
-                    # Remove the placement group after all tasks are done, so that
-                    # the next job can be scheduled on the released resources
-                    # immediately.
-                    ray_util.remove_placement_group(pg)
-                    sys.stdout.flush()
-                    return returncodes
-
-                run_fn = None
-                futures = []
-                """),
-            # FIXME: This is a hack to make sure that the functions can be found
-            # by ray.remote. This should be removed once we have a better way to
-            # specify dependencies for ray.
-            inspect.getsource(log_lib._ProcessingArgs),  # pylint: disable=protected-access
-            inspect.getsource(log_lib._get_context),  # pylint: disable=protected-access
-            inspect.getsource(log_lib._handle_io_stream),  # pylint: disable=protected-access
-            inspect.getsource(log_lib.process_subprocess_stream),
-            inspect.getsource(log_lib.run_with_log),
-            inspect.getsource(log_lib.make_task_bash_script),
-            inspect.getsource(log_lib.add_ray_env_vars),
-            inspect.getsource(log_lib.run_bash_command_with_log),
-            'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
-        ]
-        # Currently, the codegen program is/can only be submitted to the head
-        # node, due to using job_lib for updating job statuses, and using
-        # autostop_lib here.
-        self._code.append(
-            # Use hasattr to handle backward compatibility.
-            # TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
-            textwrap.dedent("""\
-              if hasattr(autostop_lib, 'set_last_active_time_to_now'):
-                  autostop_lib.set_last_active_time_to_now()
-            """))
-        self._code += [
-            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
-        ]
-
-    def add_gang_scheduling_placement_group_and_setup(
-        self,
-        num_nodes: int,
-        resources_dict: Dict[str, float],
-        stable_cluster_internal_ips: List[str],
-        env_vars: Dict[str, str],
-        setup_cmd: Optional[str] = None,
-        setup_log_path: Optional[str] = None,
-    ) -> None:
-        """Create the gang scheduling placement group for a Task.
-
-        cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
-        variable is assigned in a deterministic order whenever a new task is
-        added.
-        """
-        assert self._has_prologue, (
-            'Call add_prologue() before '
-            'add_gang_scheduling_placement_group_and_setup().')
-        self._has_gang_scheduling = True
-        self._num_nodes = num_nodes
-
-        bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
-        # Set CPU to avoid ray hanging the resources allocation
-        # for remote functions, since the task will request 1 CPU
-        # by default.
-        task_cpu_demand = resources_dict.pop('CPU')
-
-        if resources_dict:
-            assert len(resources_dict) == 1, (
-                'There can only be one type of accelerator per instance. '
-                f'Found: {resources_dict}.')
-            acc_name, acc_count = list(resources_dict.items())[0]
-            gpu_dict = {'GPU': acc_count}
-            # gpu_dict should be empty when the accelerator is not GPU.
-            # TODO(zongheng,zhanghao): an alternative is to start the remote
-            # cluster with custom resource 'GPU': <n> even if the accelerator(s)
-            # are not GPU. We opt for the current solution for now.
-            if accelerator_registry.is_schedulable_non_gpu_accelerator(
-                    acc_name):
-                gpu_dict = {}
-            for bundle in bundles:
-                bundle.update({
-                    # Set the GPU to avoid ray hanging the resources allocation
-                    **gpu_dict,
-                })
-
-        streaming_message = (
-            f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
-            f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
-            f'be killed){colorama.Style.RESET_ALL}')
-        self._code += [
-            textwrap.dedent(f"""\
-                pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
-                plural = 's' if {num_nodes} > 1 else ''
-                node_str = f'{num_nodes} node{{plural}}'
-                message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
-                           'Waiting for task resources on '
-                           f'{{node_str}}.{colorama.Style.RESET_ALL}')
-                print(message, flush=True)
-                # FIXME: This will print the error message from autoscaler if
-                # it is waiting for other task to finish. We should hide the
-                # error message.
-                ray.get(pg.ready())
-                print({streaming_message!r}, flush=True)
-                """)
-        ]
-
-        job_id = self.job_id
-        if setup_cmd is not None:
-            setup_envs = env_vars.copy()
-            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
-            self._code += [
-                textwrap.dedent(f"""\
-                setup_cmd = {setup_cmd!r}
-                _SETUP_CPUS = 0.0001
-                # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
-                # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
-                # We unset it so that user setup command may properly use this env var.
-                setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
-                job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
-
-                # The schedule_step should be called after the job status is set to non-PENDING,
-                # otherwise, the scheduler will think the current job is not submitted yet, and
-                # skip the scheduling step.
-                job_lib.scheduler.schedule_step()
-
-                total_num_nodes = len(ray.nodes())
-                setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
-                setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-                setup_workers = [run_bash_command_with_log \\
-                    .options(
-                        name='setup',
-                        num_cpus=_SETUP_CPUS,
-                        scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
-                            placement_group=setup_pg,
-                            placement_group_bundle_index=i)
-                    ) \\
-                    .remote(
-                        setup_cmd,
-                        os.path.expanduser({setup_log_path!r}),
-                        env_vars={setup_envs!r},
-                        stream_logs=True,
-                        with_ray=True,
-                    ) for i in range(total_num_nodes)]
-                setup_returncodes = get_or_fail(setup_workers, setup_pg)
-                if sum(setup_returncodes) != 0:
-                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
-                    # This waits for all streaming logs to finish.
-                    time.sleep(1)
-                    print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-                          'return code list:{colorama.Style.RESET_ALL}',
-                          setup_returncodes,
-                          flush=True)
-                    # Need this to set the job status in ray job to be FAILED.
-                    sys.exit(1)
-                """)
-            ]
-
-        self._code.append(f'job_lib.set_job_started({self.job_id!r})')
-        if setup_cmd is None:
-            # Need to call schedule_step() to make sure the scheduler
-            # schedule the next pending job.
-            self._code.append('job_lib.scheduler.schedule_step()')
-
-        # Export IP and node rank to the environment variables.
-        self._code += [
-            textwrap.dedent(f"""\
-                @ray.remote
-                def check_ip():
-                    return ray.util.get_node_ip_address()
-                gang_scheduling_id_to_ip = ray.get([
-                    check_ip.options(
-                            num_cpus={task_cpu_demand},
-                            scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
-                                placement_group=pg,
-                                placement_group_bundle_index=i
-                            )).remote()
-                    for i in range(pg.bundle_count)
-                ])
-
-                cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
-                job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
-                job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
-                job_ip_list_str = '\\n'.join(job_ip_rank_list)
-                """),
-        ]
-
-    def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
-        """Register the run function to be run on the remote cluster.
-
-        Args:
-            run_fn: The run function to be run on the remote cluster.
-        """
-        assert self._has_gang_scheduling, (
-            'Call add_gang_scheduling_placement_group_and_setup() '
-            'before register_run_fn().')
-        assert not self._has_register_run_fn, (
-            'register_run_fn() called twice?')
-        self._has_register_run_fn = True
-
-        self._code += [
-            run_fn,
-            f'run_fn = {run_fn_name}',
-        ]
-
-    def add_ray_task(self,
-                     bash_script: Optional[str],
-                     task_name: Optional[str],
-                     ray_resources_dict: Dict[str, float],
-                     log_dir: str,
-                     env_vars: Optional[Dict[str, str]] = None,
-                     gang_scheduling_id: int = 0) -> None:
-        """Generates code for a ray remote task that runs a bash command."""
-        assert self._has_gang_scheduling, (
-            'Call add_gang_scheduling_placement_group_and_setup() before '
-            'add_ray_task().')
-        assert (not self._has_register_run_fn or
-                bash_script is None), ('bash_script should '
-                                       'be None when run_fn is registered.')
-        task_cpu_demand = ray_resources_dict.pop('CPU')
-        # Build remote_task.options(...)
-        #   resources=...
-        #   num_gpus=...
-        options = []
-        options.append(f'num_cpus={task_cpu_demand}')
-
-        num_gpus = 0.0
-        if ray_resources_dict:
-            assert len(ray_resources_dict) == 1, (
-                'There can only be one type of accelerator per instance. '
-                f'Found: {ray_resources_dict}.')
-            num_gpus = list(ray_resources_dict.values())[0]
-            options.append(f'resources={json.dumps(ray_resources_dict)}')
-
-            resources_key = list(ray_resources_dict.keys())[0]
-            if not accelerator_registry.is_schedulable_non_gpu_accelerator(
-                    resources_key):
-                # `num_gpus` should be empty when the accelerator is not GPU.
-                # FIXME: use a set of GPU types, instead of 'tpu' in the key.
-
-                # Passing this ensures that the Ray remote task gets
-                # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
-                # would be force-set to empty by Ray.
-                options.append(f'num_gpus={num_gpus}')
-        options.append(
-            'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy('  # pylint: disable=line-too-long
-            'placement_group=pg, '
-            f'placement_group_bundle_index={gang_scheduling_id})')
-
-        sky_env_vars_dict_str = [
-            textwrap.dedent(f"""\
-            sky_env_vars_dict = {{}}
-            sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
-            sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
-            """)
-        ]
-
-        if env_vars is not None:
-            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
-                                         for k, v in env_vars.items())
-        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
-
-        options_str = ', '.join(options)
-        logger.debug('Added Task with options: '
-                     f'{options_str}')
-        # Script to block completion of a job until all storage mounted with
-        # CACHED_MOUNT mode is uploaded to remote.
-        rclone_flush_script = textwrap.dedent(f"""\
-
-            # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
-            # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
-            # rclone for normal mounts as well.
-            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
-                [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
-                [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
-                flushed=0
-                # extra second on top of --vfs-cache-poll-interval to
-                # avoid race condition between rclone log line creation and this check.
-                sleep 1
-                while [ $flushed -eq 0 ]; do
-                    # sleep for the same interval as --vfs-cache-poll-interval
-                    sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
-                    flushed=1
-                    for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
-                        exitcode=0
-                        tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
-                        if [ $exitcode -ne 0 ]; then
-                            echo "skypilot: cached mount is still uploading to remote"
-                            flushed=0
-                            break
-                        fi
-                    done
-                done
-                echo "skypilot: cached mount uploaded complete"
-            fi""")
-        self._code += [
-            sky_env_vars_dict_str,
-            textwrap.dedent(f"""\
-                script = {bash_script!r}
-                rclone_flush_script = {rclone_flush_script!r}
-                if run_fn is not None:
-                    script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-
-                if script is not None:
-                    script += rclone_flush_script
-                    sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
-
-                    ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
-                    rank = job_ip_rank_map[ip]
-
-                    if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
-                        name_str = '{task_name},' if {task_name!r} != None else 'task,'
-                        log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
-                    else: # Single-node or multi-node task on multi-node cluster
-                        idx_in_cluster = cluster_ips_to_node_id[ip]
-                        if cluster_ips_to_node_id[ip] == 0:
-                            node_name = 'head'
-                        else:
-                            node_name = f'worker{{idx_in_cluster}}'
-                        name_str = f'{{node_name}}, rank={{rank}},'
-                        log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
-                    sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
-
-                    sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-
-                    futures.append(run_bash_command_with_log \\
-                            .options(name=name_str, {options_str}) \\
-                            .remote(
-                                script,
-                                log_path,
-                                env_vars=sky_env_vars_dict,
-                                stream_logs=True,
-                                with_ray=True,
-                            ))""")
-        ]
-
-    def add_epilogue(self) -> None:
-        """Generates code that waits for all tasks, then exits."""
-        assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
-        assert not self._has_epilogue, 'add_epilogue() called twice?'
-        self._has_epilogue = True
-
-        self._code += [
-            textwrap.dedent(f"""\
-            returncodes = get_or_fail(futures, pg)
-            if sum(returncodes) != 0:
-                job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
-                # Schedule the next pending job immediately to make the job
-                # scheduling more efficient.
-                job_lib.scheduler.schedule_step()
-                # This waits for all streaming logs to finish.
-                time.sleep(0.5)
-                reason = ''
-                # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
-                if any(r == 139 for r in returncodes):
-                    reason = '(likely due to Segmentation Fault)'
-                if any(r == 137 for r in returncodes):
-                    # Find the first non-137 return code
-                    non_137 = next(r for r in returncodes if r != 137)
-                    reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
-                print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
-                      'return code list:{colorama.Style.RESET_ALL}',
-                      returncodes,
-                      reason,
-                      flush=True)
-                # Need this to set the job status in ray job to be FAILED.
-                sys.exit(1)
-            else:
-                job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
-                # Schedule the next pending job immediately to make the job
-                # scheduling more efficient.
-                job_lib.scheduler.schedule_step()
-                # This waits for all streaming logs to finish.
-                time.sleep(0.5)
-            """)
-        ]
-
-    def build(self) -> str:
-        """Returns the entire generated program."""
-        assert self._has_epilogue, 'Call add_epilogue() before build().'
-        return '\n'.join(self._code)
-
-
 class GangSchedulingStatus(enum.Enum):
     """Enum for gang scheduling status."""
     CLUSTER_READY = 0
@@ -1340,6 +901,34 @@ class RetryingVmProvisioner(object):
             zones = [clouds.Zone(name=to_provision.zone)]
             yield zones

+    def _insufficient_resources_msg(
+        self,
+        to_provision: resources_lib.Resources,
+        requested_resources: Set[resources_lib.Resources],
+        insufficient_resources: Optional[List[str]],
+    ) -> str:
+        insufficent_resource_msg = ('' if insufficient_resources is None else
+                                    f' ({", ".join(insufficient_resources)})')
+        message = f'Failed to acquire resources{insufficent_resource_msg} '
+        if to_provision.zone is not None:
+            message += (f'in {to_provision.zone} for {requested_resources}. ')
+        elif to_provision.region is not None and to_provision.cloud is not None:
+            # For public clouds, provision.region is always set.
+            if clouds.SSH().is_same_cloud(to_provision.cloud):
+                message += (
+                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'for {requested_resources}. The SSH Node Pool may not '
+                    'have enough resources.')
+            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+                message += (f'in context {to_provision.region} for '
+                            f'{requested_resources}. ')
+            else:
+                message += (f'in all zones in {to_provision.region} for '
+                            f'{requested_resources}. ')
+        else:
+            message += (f'{to_provision.cloud} for {requested_resources}. ')
+        return message
+
     def _retry_zones(
         self,
         to_provision: resources_lib.Resources,
@@ -1418,6 +1007,7 @@ class RetryingVmProvisioner(object):
                 f'To request quotas, check the instruction: '
                 f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')

+        insufficient_resources = None
         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
                                        prev_cluster_ever_up):
@@ -1630,6 +1220,24 @@ class RetryingVmProvisioner(object):
                 # No teardown happens for this error.
                 with ux_utils.print_exception_no_traceback():
                     raise
+            except config_lib.KubernetesError as e:
+                if e.insufficent_resources:
+                    insufficient_resources = e.insufficent_resources
+                # NOTE: We try to cleanup the cluster even if the previous
+                # cluster does not exist. Also we are fast at
+                # cleaning up clusters now if there is no existing node.
+                CloudVmRayBackend().post_teardown_cleanup(
+                    handle,
+                    terminate=not prev_cluster_ever_up,
+                    remove_from_db=False,
+                    failover=True,
+                )
+                # TODO(suquark): other clouds may have different zone
+                # blocking strategy. See '_update_blocklist_on_error'
+                # for details.
+                FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                    self._blocked_resources, to_provision, region, zones, e)
+                continue
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
@@ -1760,26 +1368,9 @@ class RetryingVmProvisioner(object):
                     terminate=terminate_or_stop,
                     remove_from_db=False)

-
-
-
-                       f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            if clouds.SSH().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in SSH Node Pool '
-                           f'({to_provision.region.lstrip("ssh-")}) for '
-                           f'{requested_resources}. The SSH Node Pool may not '
-                           'have enough resources.')
-            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in context '
-                           f'{to_provision.region} for {requested_resources}. ')
-            else:
-                message = ('Failed to acquire resources in all zones in '
-                           f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -2175,8 +1766,6 @@ class RetryingVmProvisioner(object):
2175 1766    # terminated by _retry_zones().
2176 1767    assert (prev_cluster_status == status_lib.ClusterStatus.INIT
2177 1768    ), prev_cluster_status
2178      -  assert global_user_state.get_handle_from_cluster_name(
2179      -  cluster_name) is None, cluster_name
2180 1769    logger.info(
2181 1770    ux_utils.retry_message(
2182 1771    f'Retrying provisioning with requested resources: '
@@ -2215,9 +1804,8 @@ class RetryingVmProvisioner(object):
2215 1804    for (resource, exception) in resource_exceptions.items():
2216 1805    table.add_row([
2217 1806    resource.infra.formatted_str(),
2218      -  resources_utils.format_resource(
2219      -
2220      -  exception
     1807 +  resources_utils.format_resource(
     1808 +  resource, simplified_only=True)[0], exception
2221 1809    ])
2222 1810    # Set the max width of REASON column to 80 to avoid the table
2223 1811    # being wrapped in a unreadable way.
@@ -2239,6 +1827,18 @@ class SSHTunnelInfo:
2239 1827    pid: int
2240 1828
2241 1829
     1830 +  def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
     1831 +  try:
     1832 +  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
     1833 +  s.settimeout(0.5)
     1834 +  s.connect(('localhost', tunnel.port))
     1835 +  return True
     1836 +  except socket.error as e:
     1837 +  logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
     1838 +  f'{common_utils.format_exception(e)}')
     1839 +  return False
     1840 +
     1841 +
2242 1842    class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2243 1843    """A pickle-able handle to a cluster created by CloudVmRayBackend.
2244 1844
@@ -2261,8 +1861,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2261 1861    - (optional) Skylet SSH tunnel info.
2262 1862    """
2263 1863    # Bump if any fields get added/removed/changed, and add backward
2264      -  #
2265      -  _VERSION =
     1864 +  # compatibility logic in __setstate__ and/or __getstate__.
     1865 +  _VERSION = 12
2266 1866
2267 1867    def __init__(
2268 1868    self,
@@ -2296,7 +1896,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2296 1896    self.launched_resources = launched_resources
2297 1897    self.docker_user: Optional[str] = None
2298 1898    self.is_grpc_enabled = True
2299      -  self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
2300 1899
2301 1900    def __repr__(self):
2302 1901    return (f'ResourceHandle('
@@ -2313,12 +1912,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2313 1912    f'{self.launched_resources}, '
2314 1913    f'\n\tdocker_user={self.docker_user},'
2315 1914    f'\n\tssh_user={self.ssh_user},'
2316      -  f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
2317      -  f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
     1915 +  f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
2318 1916
2319 1917    def get_cluster_name(self):
2320 1918    return self.cluster_name
2321 1919
     1920 +  def get_cluster_name_on_cloud(self):
     1921 +  return self.cluster_name_on_cloud
     1922 +
2322 1923    def _use_internal_ips(self):
2323 1924    """Returns whether to use internal IPs for SSH connections."""
2324 1925    # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2345,7 +1946,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2345 1946    def _update_cluster_info(self):
2346 1947    # When a cluster is on a cloud that does not support the new
2347 1948    # provisioner, we should skip updating cluster_info.
2348      -  if (self.launched_resources.cloud
     1949 +  if (self.launched_resources.cloud is not None and
     1950 +  self.launched_resources.cloud.PROVISIONER_VERSION >=
2349 1951    clouds.ProvisionerVersion.SKYPILOT):
2350 1952    provider_name = str(self.launched_resources.cloud).lower()
2351 1953    config = {}
@@ -2643,64 +2245,199 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2643 2245    cluster_config_file)
2644 2246    self.docker_user = docker_user
2645 2247
     2248 +  def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
     2249 +  metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
     2250 +  self.cluster_name)
     2251 +  if metadata is None:
     2252 +  return None
     2253 +  return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
     2254 +
     2255 +  def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
     2256 +  global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
     2257 +  self.cluster_name,
     2258 +  (tunnel.port, tunnel.pid) if tunnel is not None else None)
     2259 +
     2260 +  def close_skylet_ssh_tunnel(self) -> None:
     2261 +  """Terminate the SSH tunnel process and clear its metadata."""
     2262 +  tunnel = self._get_skylet_ssh_tunnel()
     2263 +  if tunnel is None:
     2264 +  return
     2265 +  logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
     2266 +  self.cluster_name, tunnel.port)
     2267 +  try:
     2268 +  self._terminate_ssh_tunnel_process(tunnel)
     2269 +  finally:
     2270 +  self._set_skylet_ssh_tunnel(None)
     2271 +
2646 2272    def get_grpc_channel(self) -> 'grpc.Channel':
2647      -
2648      -
2649      -
2650      -
     2273 +  grpc_options = [
     2274 +  # The task YAMLs can be large, so the default
     2275 +  # max_receive_message_length of 4MB might not be enough.
     2276 +  ('grpc.max_receive_message_length', -1),
     2277 +  ]
     2278 +  # It's fine to not grab the lock here, as we're only reading,
     2279 +  # and writes are very rare.
     2280 +  # It's acceptable to read while another process is opening a tunnel,
     2281 +  # because it will only happen on:
     2282 +  # 1. A new cluster who has no tunnel yet, or
     2283 +  # 2. A cluster with an unhealthy tunnel
     2284 +  # For (2), for processes that read the "stale" tunnel, it will fail
     2285 +  # and on the next retry, it will call get_grpc_channel again
     2286 +  # and get the new tunnel.
     2287 +  tunnel = self._get_skylet_ssh_tunnel()
     2288 +  if tunnel is not None:
     2289 +  if _is_tunnel_healthy(tunnel):
     2290 +  return grpc.insecure_channel(f'localhost:{tunnel.port}',
     2291 +  options=grpc_options)
     2292 +  logger.debug('Failed to connect to SSH tunnel for cluster '
     2293 +  f'{self.cluster_name!r} on port {tunnel.port}')
     2294 +
     2295 +  lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
     2296 +  remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
     2297 +  start_time = time.perf_counter()
     2298 +  attempt = 1
     2299 +
     2300 +  def _get_remaining_timeout() -> float:
     2301 +  return max(0.0,
     2302 +  remaining_timeout - (time.perf_counter() - start_time))
     2303 +
     2304 +  while remaining_timeout > 0:
     2305 +  logger.debug(
     2306 +  'Attempting to acquire exclusive lock for %s (attempt %d)',
     2307 +  lock_id, attempt)
     2308 +  exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
     2309 +  try:
     2310 +  with exclusive_lock.acquire(blocking=False):
     2311 +  wait_elapsed = time.perf_counter() - start_time
     2312 +  logger.debug(f'Acquired exclusive lock for {lock_id} after '
     2313 +  f'{wait_elapsed:.2f}s')
     2314 +  try:
     2315 +  tunnel = self._open_and_update_skylet_tunnel()
     2316 +  return grpc.insecure_channel(f'localhost:{tunnel.port}',
     2317 +  options=grpc_options)
     2318 +  except Exception as e: # pylint: disable=broad-except
     2319 +  # Failed to open tunnel, release the lock and retry.
     2320 +  logger.warning(f'Failed to open tunnel for cluster '
     2321 +  f'{self.cluster_name!r}: '
     2322 +  f'{common_utils.format_exception(e)}')
     2323 +  remaining_timeout = _get_remaining_timeout()
     2324 +  attempt += 1
     2325 +  continue
     2326 +  except locks.LockTimeout:
     2327 +  pass
2651 2328
2652      -
2653      -
     2329 +  remaining_timeout = _get_remaining_timeout()
     2330 +  logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
     2331 +  f'waiting on shared lock (attempt {attempt})')
     2332 +  try:
     2333 +  # Use shared lock so that concurrent readers can
     2334 +  # proceed in parallel.
     2335 +  shared_lock = locks.get_lock(lock_id,
     2336 +  remaining_timeout,
     2337 +  shared_lock=True)
     2338 +  # Wait for the exclusive lock to be released.
     2339 +  shared_lock.acquire(blocking=True)
     2340 +  # We only need the lock for signalling that the new tunnel has
     2341 +  # been opened, not for checking the tunnel health.
     2342 +  # Same reasoning as why we don't need to grab the lock in
     2343 +  # the fast path at the start of this function.
     2344 +  shared_lock.release()
     2345 +  wait_elapsed = time.perf_counter() - start_time
     2346 +  logger.debug(f'Acquired shared lock for {lock_id} after '
     2347 +  f'{wait_elapsed:.2f}s')
     2348 +  except locks.LockTimeout as e:
     2349 +  raise RuntimeError(
     2350 +  f'Failed to get gRPC channel for cluster '
     2351 +  f'{self.cluster_name!r} due to a timeout when waiting '
     2352 +  'for the SSH tunnel to be opened. Please try again or '
     2353 +  f'manually remove the lock at {lock_id}. '
     2354 +  f'{common_utils.format_exception(e)}') from e
     2355 +
     2356 +  # Add small jitter before probing to smoothen the effects
     2357 +  # of many readers waking up simultaneously.
     2358 +  jitter = random.uniform(0.01, 0.05)
     2359 +  time.sleep(jitter)
     2360 +
     2361 +  # Re-read the tunnel metadata and verify it's healthy.
     2362 +  tunnel = self._get_skylet_ssh_tunnel()
     2363 +  if tunnel is not None:
     2364 +  if _is_tunnel_healthy(tunnel):
     2365 +  return grpc.insecure_channel(f'localhost:{tunnel.port}',
     2366 +  options=grpc_options)
     2367 +  logger.debug('Failed to connect to SSH tunnel for cluster '
     2368 +  f'{self.cluster_name!r} on port {tunnel.port}')
     2369 +  # Tunnel is still unhealthy or missing, try again with updated
     2370 +  # timeout. This could happen in the case where the thread who
     2371 +  # held the exclusive lock to open the tunnel crashed.
     2372 +  remaining_timeout = _get_remaining_timeout()
     2373 +  attempt += 1
     2374 +  raise RuntimeError('Timeout waiting for gRPC channel for cluster '
     2375 +  f'{self.cluster_name!r} to be ready.')
     2376 +
     2377 +  def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
     2378 +  """Terminate the SSH tunnel process."""
2654 2379    try:
2655 2380    proc = psutil.Process(tunnel_info.pid)
2656 2381    if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
2657 2382    logger.debug(
2658 2383    f'Terminating SSH tunnel process {tunnel_info.pid}')
2659      -  proc.
2660      -  try:
2661      -  proc.wait(timeout=3)
2662      -  except psutil.TimeoutExpired:
2663      -  proc.kill()
2664      -  proc.wait(timeout=1)
     2384 +  subprocess_utils.kill_children_processes(proc.pid)
2665 2385    except psutil.NoSuchProcess:
2666 2386    pass
2667 2387    except Exception as e: # pylint: disable=broad-except
2668 2388    logger.warning(
2669 2389    f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
2670 2390
2671      -  def
     2391 +  def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
2672 2392    """Opens an SSH tunnel to the Skylet on the head node,
2673 2393    updates the cluster handle, and persists it to the database."""
2674      -
2675      -
2676      -
2677      -
2678      -
2679      -
2680      -
2681      -
2682      -
2683      -
2684      -
2685      -
2686      -
     2394 +  max_attempts = 3
     2395 +  # There could be a race condition here, as multiple processes may
     2396 +  # attempt to open the same port at the same time.
     2397 +  for attempt in range(max_attempts):
     2398 +  runners = self.get_command_runners()
     2399 +  head_runner = runners[0]
     2400 +  local_port = random.randint(10000, 65535)
     2401 +  try:
     2402 +  ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
     2403 +  head_runner, (local_port, constants.SKYLET_GRPC_PORT))
     2404 +  except exceptions.CommandError as e:
     2405 +  # Don't retry if the error is due to timeout,
     2406 +  # connection refused, Kubernetes pods not found,
     2407 +  # or an in-progress termination.
     2408 +  if (e.detailed_reason is not None and
     2409 +  (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
     2410 +  e.detailed_reason) or
     2411 +  backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
     2412 +  e.detailed_reason) or attempt == max_attempts - 1)):
     2413 +  raise e
     2414 +  logger.warning(
     2415 +  f'Failed to open SSH tunnel on port {local_port} '
     2416 +  f'({attempt + 1}/{max_attempts}). '
     2417 +  f'{e.error_msg}\n{e.detailed_reason}')
     2418 +  continue
     2419 +  tunnel_info = SSHTunnelInfo(port=local_port,
     2420 +  pid=ssh_tunnel_proc.pid)
     2421 +  break
     2422 +
2687 2423    try:
2688 2424    grpc.channel_ready_future(
2689 2425    grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
2690 2426    timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
2691 2427    # Clean up existing tunnel before setting up the new one.
2692      -
2693      -
2694      -
2695      -
     2428 +  old_tunnel = self._get_skylet_ssh_tunnel()
     2429 +  if old_tunnel is not None:
     2430 +  self._terminate_ssh_tunnel_process(old_tunnel)
     2431 +  self._set_skylet_ssh_tunnel(tunnel_info)
     2432 +  return tunnel_info
2696 2433    except grpc.FutureTimeoutError as e:
2697      -  self.
     2434 +  self._terminate_ssh_tunnel_process(tunnel_info)
2698 2435    logger.warning(
2699 2436    f'Skylet gRPC channel for cluster {self.cluster_name} not '
2700 2437    f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
2701 2438    raise e
2702 2439    except Exception as e:
2703      -  self.
     2440 +  self._terminate_ssh_tunnel_process(tunnel_info)
2704 2441    raise e
2705 2442
2706 2443    @property
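The `get_grpc_channel` change above relies on a writer-takes-exclusive, readers-wait-shared locking pattern around the tunnel metadata: a healthy tunnel is reused without locking, one process wins an exclusive lock to reopen it, and everyone else waits on a shared lock and re-probes. The sketch below illustrates that pattern in isolation with a plain `fcntl` file lock and a TCP probe. It is not SkyPilot code: the callback names (`read_port`, `open_tunnel`), the `fcntl`-based locking, and the Unix-only assumption are all choices made for this example.

```python
import fcntl
import os
import socket
import time
from typing import Callable, Optional


def _port_is_open(port: int, timeout: float = 0.5) -> bool:
    """Probe localhost:port with a short TCP connect, like a tunnel health check."""
    try:
        with socket.create_connection(('localhost', port), timeout=timeout):
            return True
    except OSError:
        return False


def ensure_tunnel(lock_path: str,
                  read_port: Callable[[], Optional[int]],
                  open_tunnel: Callable[[], int],
                  timeout: float = 30.0) -> int:
    """Return a healthy local port, (re)opening the tunnel under an exclusive lock.

    `read_port` returns the recorded port (or None); `open_tunnel` opens a new
    tunnel and returns its port. Both are hypothetical callbacks.
    """
    deadline = time.monotonic() + timeout
    fd = os.open(lock_path, os.O_RDWR | os.O_CREAT)
    try:
        while time.monotonic() < deadline:
            # Fast path: reuse a recorded, healthy tunnel without any lock.
            port = read_port()
            if port is not None and _port_is_open(port):
                return port
            try:
                # Writer path: exclusive, non-blocking; the winner reopens.
                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                try:
                    return open_tunnel()
                finally:
                    fcntl.flock(fd, fcntl.LOCK_UN)
            except BlockingIOError:
                # Reader path: a shared lock just waits for the writer to
                # finish; re-probe on the next loop iteration.
                fcntl.flock(fd, fcntl.LOCK_SH)
                fcntl.flock(fd, fcntl.LOCK_UN)
                time.sleep(0.05)  # small jitter before re-probing
        raise TimeoutError('tunnel was not ready before the deadline')
    finally:
        os.close(fd)
```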
@@ -2713,6 +2450,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2713 2450    def cluster_yaml(self, value: Optional[str]):
2714 2451    self._cluster_yaml = value
2715 2452
     2453 +  @property
     2454 +  def instance_ids(self):
     2455 +  if self.cached_cluster_info is not None:
     2456 +  return self.cached_cluster_info.instance_ids()
     2457 +  return None
     2458 +
2716 2459    @property
2717 2460    def ssh_user(self):
2718 2461    if self.cached_cluster_info is not None:
@@ -2750,7 +2493,16 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2750 2493    @property
2751 2494    def is_grpc_enabled_with_flag(self) -> bool:
2752 2495    """Returns whether this handle has gRPC enabled and gRPC flag is set."""
2753      -  return env_options.Options.ENABLE_GRPC.get() and
     2496 +  return (env_options.Options.ENABLE_GRPC.get() and
     2497 +  self.is_grpc_enabled and
     2498 +  not isinstance(self.launched_resources.cloud, clouds.Slurm))
     2499 +
     2500 +  def __getstate__(self):
     2501 +  state = self.__dict__.copy()
     2502 +  # For backwards compatibility. Refer to
     2503 +  # https://github.com/skypilot-org/skypilot/pull/7133
     2504 +  state.setdefault('skylet_ssh_tunnel', None)
     2505 +  return state
2754 2506
2755 2507    def __setstate__(self, state):
2756 2508    self._version = self._VERSION
@@ -2809,6 +2561,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2809 2561    state['is_grpc_enabled'] = False
2810 2562    state['skylet_ssh_tunnel'] = None
2811 2563
     2564 +  if version >= 12:
     2565 +  # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
     2566 +  state.pop('skylet_ssh_tunnel', None)
     2567 +
2812 2568    self.__dict__.update(state)
2813 2569
2814 2570    # Because the update_cluster_ips and update_ssh_ports
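The `_VERSION` bump to 12 together with the new `__getstate__`/`__setstate__` logic in the hunks above follows the usual versioned-pickling recipe for keeping previously pickled handles loadable. A minimal, self-contained sketch of that recipe is shown below; the class and field names are illustrative only and are not the real resource handle.

```python
import pickle


class VersionedHandle:
    """Illustrative only: versioned pickling with backward compatibility."""
    _VERSION = 2

    def __init__(self, name: str):
        self._version = self._VERSION
        self.name = name
        self.tunnel_port = None  # field that newer versions stopped storing

    def __getstate__(self):
        state = self.__dict__.copy()
        # Older readers may still expect the field; keep a safe default.
        state.setdefault('tunnel_port', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 1)
        if version >= 2:
            # Newer writers keep this value elsewhere, so drop it here.
            state.pop('tunnel_port', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)
        # Re-derive anything not carried in the pickle.
        if not hasattr(self, 'tunnel_port'):
            self.tunnel_port = None


# Round-trip example.
handle = pickle.loads(pickle.dumps(VersionedHandle('demo')))
assert handle.name == 'demo'
```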
@@ -2886,21 +2642,180 @@ class SkyletClient:
|
|
|
2886
2642
|
|
|
2887
2643
|
def __init__(self, channel: 'grpc.Channel'):
|
|
2888
2644
|
self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
|
|
2645
|
+
self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
|
|
2646
|
+
self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
|
|
2647
|
+
self._managed_jobs_stub = (
|
|
2648
|
+
managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
|
|
2889
2649
|
|
|
2890
2650
|
def set_autostop(
|
|
2891
2651
|
self,
|
|
2892
2652
|
request: 'autostopv1_pb2.SetAutostopRequest',
|
|
2893
|
-
timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2653
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2894
2654
|
) -> 'autostopv1_pb2.SetAutostopResponse':
|
|
2895
2655
|
return self._autostop_stub.SetAutostop(request, timeout=timeout)
|
|
2896
2656
|
|
|
2897
2657
|
def is_autostopping(
|
|
2898
2658
|
self,
|
|
2899
2659
|
request: 'autostopv1_pb2.IsAutostoppingRequest',
|
|
2900
|
-
timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2660
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2901
2661
|
) -> 'autostopv1_pb2.IsAutostoppingResponse':
|
|
2902
2662
|
return self._autostop_stub.IsAutostopping(request, timeout=timeout)
|
|
2903
2663
|
|
|
2664
|
+
def add_job(
|
|
2665
|
+
self,
|
|
2666
|
+
request: 'jobsv1_pb2.AddJobRequest',
|
|
2667
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2668
|
+
) -> 'jobsv1_pb2.AddJobResponse':
|
|
2669
|
+
return self._jobs_stub.AddJob(request, timeout=timeout)
|
|
2670
|
+
|
|
2671
|
+
def queue_job(
|
|
2672
|
+
self,
|
|
2673
|
+
request: 'jobsv1_pb2.QueueJobRequest',
|
|
2674
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2675
|
+
) -> 'jobsv1_pb2.QueueJobResponse':
|
|
2676
|
+
return self._jobs_stub.QueueJob(request, timeout=timeout)
|
|
2677
|
+
|
|
2678
|
+
def update_status(
|
|
2679
|
+
self,
|
|
2680
|
+
request: 'jobsv1_pb2.UpdateStatusRequest',
|
|
2681
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2682
|
+
) -> 'jobsv1_pb2.UpdateStatusResponse':
|
|
2683
|
+
return self._jobs_stub.UpdateStatus(request, timeout=timeout)
|
|
2684
|
+
|
|
2685
|
+
def get_job_queue(
|
|
2686
|
+
self,
|
|
2687
|
+
request: 'jobsv1_pb2.GetJobQueueRequest',
|
|
2688
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2689
|
+
) -> 'jobsv1_pb2.GetJobQueueResponse':
|
|
2690
|
+
return self._jobs_stub.GetJobQueue(request, timeout=timeout)
|
|
2691
|
+
|
|
2692
|
+
def cancel_jobs(
|
|
2693
|
+
self,
|
|
2694
|
+
request: 'jobsv1_pb2.CancelJobsRequest',
|
|
2695
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2696
|
+
) -> 'jobsv1_pb2.CancelJobsResponse':
|
|
2697
|
+
return self._jobs_stub.CancelJobs(request, timeout=timeout)
|
|
2698
|
+
|
|
2699
|
+
def fail_all_in_progress_jobs(
|
|
2700
|
+
self,
|
|
2701
|
+
request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
|
|
2702
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2703
|
+
) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
|
|
2704
|
+
return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
|
|
2705
|
+
|
|
2706
|
+
def get_job_status(
|
|
2707
|
+
self,
|
|
2708
|
+
request: 'jobsv1_pb2.GetJobStatusRequest',
|
|
2709
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2710
|
+
) -> 'jobsv1_pb2.GetJobStatusResponse':
|
|
2711
|
+
return self._jobs_stub.GetJobStatus(request, timeout=timeout)
|
|
2712
|
+
|
|
2713
|
+
def get_job_submitted_timestamp(
|
|
2714
|
+
self,
|
|
2715
|
+
request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
|
|
2716
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2717
|
+
) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
|
|
2718
|
+
return self._jobs_stub.GetJobSubmittedTimestamp(request,
|
|
2719
|
+
timeout=timeout)
|
|
2720
|
+
|
|
2721
|
+
def get_job_ended_timestamp(
|
|
2722
|
+
self,
|
|
2723
|
+
request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
|
|
2724
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2725
|
+
) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
|
|
2726
|
+
return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
|
|
2727
|
+
|
|
2728
|
+
def get_log_dirs_for_jobs(
|
|
2729
|
+
self,
|
|
2730
|
+
request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
|
|
2731
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2732
|
+
) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
|
|
2733
|
+
return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
|
|
2734
|
+
|
|
2735
|
+
def tail_logs(
|
|
2736
|
+
self,
|
|
2737
|
+
request: 'jobsv1_pb2.TailLogsRequest',
|
|
2738
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2739
|
+
) -> Iterator['jobsv1_pb2.TailLogsResponse']:
|
|
2740
|
+
return self._jobs_stub.TailLogs(request, timeout=timeout)
|
|
2741
|
+
|
|
2742
|
+
def get_service_status(
|
|
2743
|
+
self,
|
|
2744
|
+
request: 'servev1_pb2.GetServiceStatusRequest',
|
|
2745
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2746
|
+
) -> 'servev1_pb2.GetServiceStatusResponse':
|
|
2747
|
+
return self._serve_stub.GetServiceStatus(request, timeout=timeout)
|
|
2748
|
+
|
|
2749
|
+
def add_serve_version(
|
|
2750
|
+
self,
|
|
2751
|
+
request: 'servev1_pb2.AddVersionRequest',
|
|
2752
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2753
|
+
) -> 'servev1_pb2.AddVersionResponse':
|
|
2754
|
+
return self._serve_stub.AddVersion(request, timeout=timeout)
|
|
2755
|
+
|
|
2756
|
+
def terminate_services(
|
|
2757
|
+
self,
|
|
2758
|
+
request: 'servev1_pb2.TerminateServicesRequest',
|
|
2759
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2760
|
+
) -> 'servev1_pb2.TerminateServicesResponse':
|
|
2761
|
+
return self._serve_stub.TerminateServices(request, timeout=timeout)
|
|
2762
|
+
|
|
2763
|
+
def terminate_replica(
|
|
2764
|
+
self,
|
|
2765
|
+
request: 'servev1_pb2.TerminateReplicaRequest',
|
|
2766
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2767
|
+
) -> 'servev1_pb2.TerminateReplicaResponse':
|
|
2768
|
+
return self._serve_stub.TerminateReplica(request, timeout=timeout)
|
|
2769
|
+
|
|
2770
|
+
def wait_service_registration(
|
|
2771
|
+
self,
|
|
2772
|
+
request: 'servev1_pb2.WaitServiceRegistrationRequest',
|
|
2773
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2774
|
+
) -> 'servev1_pb2.WaitServiceRegistrationResponse':
|
|
2775
|
+
# set timeout to at least 10 seconds more than service register
|
|
2776
|
+
# constant to make sure that timeouts will not occur.
|
|
2777
|
+
if timeout is not None:
|
|
2778
|
+
timeout = max(timeout,
|
|
2779
|
+
serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
|
|
2780
|
+
return self._serve_stub.WaitServiceRegistration(request,
|
|
2781
|
+
timeout=timeout)
|
|
2782
|
+
|
|
2783
|
+
def update_service(
|
|
2784
|
+
self,
|
|
2785
|
+
request: 'servev1_pb2.UpdateServiceRequest',
|
|
2786
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2787
|
+
) -> 'servev1_pb2.UpdateServiceResponse':
|
|
2788
|
+
return self._serve_stub.UpdateService(request, timeout=timeout)
|
|
2789
|
+
|
|
2790
|
+
def get_managed_job_controller_version(
|
|
2791
|
+
self,
|
|
2792
|
+
request: 'managed_jobsv1_pb2.GetVersionRequest',
|
|
2793
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2794
|
+
) -> 'managed_jobsv1_pb2.GetVersionResponse':
|
|
2795
|
+
return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
|
|
2796
|
+
|
|
2797
|
+
def get_managed_job_table(
|
|
2798
|
+
self,
|
|
2799
|
+
request: 'managed_jobsv1_pb2.GetJobTableRequest',
|
|
2800
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2801
|
+
) -> 'managed_jobsv1_pb2.GetJobTableResponse':
|
|
2802
|
+
return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
|
|
2803
|
+
|
|
2804
|
+
def get_all_managed_job_ids_by_name(
|
|
2805
|
+
self,
|
|
2806
|
+
request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
|
|
2807
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2808
|
+
) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
|
|
2809
|
+
return self._managed_jobs_stub.GetAllJobIdsByName(request,
|
|
2810
|
+
timeout=timeout)
|
|
2811
|
+
|
|
2812
|
+
def cancel_managed_jobs(
|
|
2813
|
+
self,
|
|
2814
|
+
request: 'managed_jobsv1_pb2.CancelJobsRequest',
|
|
2815
|
+
timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
|
|
2816
|
+
) -> 'managed_jobsv1_pb2.CancelJobsResponse':
|
|
2817
|
+
return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
|
|
2818
|
+
|
|
2904
2819
|
|
|
2905
2820
|
@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
|
|
2906
2821
|
class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
@@ -2931,6 +2846,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2931 2846    self._requested_features = set()
2932 2847    self._dump_final_script = False
2933 2848    self._is_managed = False
     2849 +  # Optional planner (via register_info): used under the per-cluster lock
     2850 +  # to produce a fresh concrete plan when neither a reusable snapshot nor
     2851 +  # a caller plan is available.
     2852 +  self._planner = None
2934 2853
2935 2854    # Command for running the setup script. It is only set when the
2936 2855    # setup needs to be run outside the self._setup() and as part of
@@ -2948,6 +2867,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2948 2867    self._requested_features)
2949 2868    self._dump_final_script = kwargs.pop('dump_final_script', False)
2950 2869    self._is_managed = kwargs.pop('is_managed', False)
     2870 +  # Optional planner callback for a fresh plan under lock when no
     2871 +  # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
     2872 +  self._planner = kwargs.pop('planner', self._planner)
2951 2873    assert not kwargs, f'Unexpected kwargs: {kwargs}'
2952 2874
2953 2875    def check_resources_fit_cluster(
@@ -2974,9 +2896,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2974 2896    # Usage Collection:
2975 2897    usage_lib.messages.usage.update_cluster_resources(
2976 2898    handle.launched_nodes, launched_resources)
2977      -
2978      -  if
2979      -  usage_lib.messages.usage.update_cluster_status(
     2899 +  status = global_user_state.get_status_from_cluster_name(cluster_name)
     2900 +  if status is not None:
     2901 +  usage_lib.messages.usage.update_cluster_status(status)
2980 2902
2981 2903    assert launched_resources.region is not None, handle
2982 2904
@@ -3115,7 +3037,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3115 3037    colorama.Style.RESET_ALL +
3116 3038    colorama.Style.DIM +
3117 3039    'Check concurrent requests: ' +
3118      -  'sky api status '
     3040 +  'sky api status -v | grep '
     3041 +  f'{cluster_name}'))
3119 3042
3120 3043    def _locked_provision(
3121 3044    self,
@@ -3172,8 +3095,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3172 3095    try:
3173 3096    retry_provisioner = RetryingVmProvisioner(
3174 3097    self.log_dir,
3175      -  self._dag,
3176      -  self._optimize_target,
     3098 +  self._dag, # type: ignore[arg-type]
     3099 +  self._optimize_target, # type: ignore[arg-type]
3177 3100    self._requested_features,
3178 3101    local_wheel_path,
3179 3102    wheel_hash,
@@ -3204,9 +3127,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3204 3127    gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
3205 3128    retry_message = ux_utils.retry_message(
3206 3129    f'Retry after {gap_seconds:.0f}s ')
3207      -  hint_message = (
3208      -
3209      -
     3130 +  hint_message = (
     3131 +  f'\n{retry_message} '
     3132 +  f'{ux_utils.provision_hint(cluster_name)}'
     3133 +  f'{colorama.Style.RESET_ALL}')
3210 3134
3211 3135    # Add cluster event for retry.
3212 3136    global_user_state.add_cluster_event(
@@ -3235,7 +3159,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3235 3159    logger.error(
3236 3160    ux_utils.error_message(
3237 3161    'Failed to provision resources. '
3238      -  f'{ux_utils.
     3162 +  f'{ux_utils.provision_hint(cluster_name)}'))
3239 3163    error_message += (
3240 3164    '\nTo keep retrying until the cluster is up, use '
3241 3165    'the `--retry-until-up` flag.')
@@ -3244,8 +3168,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3244 3168    error_message + '\n' + str(e),
3245 3169    failover_history=e.failover_history) from None
3246 3170    if dryrun:
3247      -
3248      -
     3171 +  handle = global_user_state.get_handle_from_cluster_name(
     3172 +  cluster_name)
     3173 +  return handle if handle is not None else None, False
3249 3174
3250 3175    if config_dict['provisioning_skipped']:
3251 3176    # Skip further provisioning.
@@ -3253,10 +3178,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3253 3178    # ('handle', 'provision_record', 'resources_vars')
3254 3179    # We need to return the handle - but it should be the existing
3255 3180    # handle for the cluster.
3256      -
3257      -
3258      -
3259      -  return
     3181 +  handle = global_user_state.get_handle_from_cluster_name(
     3182 +  cluster_name)
     3183 +  assert handle is not None, (cluster_name, handle)
     3184 +  return handle, True
3260 3185
3261 3186    if 'provision_record' in config_dict:
3262 3187    # New provisioner is used here.
@@ -3279,7 +3204,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3279 3204    global_user_state.ClusterEventType.STATUS_CHANGE)
3280 3205
3281 3206    cluster_info = provisioner.post_provision_runtime_setup(
3282      -
     3207 +  handle.launched_resources,
3283 3208    resources_utils.ClusterName(handle.cluster_name,
3284 3209    handle.cluster_name_on_cloud),
3285 3210    handle.cluster_yaml,
@@ -3293,6 +3218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3293 3218    # manually or by the cloud provider.
3294 3219    # Optimize the case where the cluster's IPs can be retrieved
3295 3220    # from cluster_info.
     3221 +  handle.cached_cluster_info = cluster_info
3296 3222    handle.docker_user = cluster_info.docker_user
3297 3223    handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
3298 3224    cluster_info=cluster_info)
@@ -3304,7 +3230,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3304 3230
3305 3231    self._update_after_cluster_provisioned(
3306 3232    handle, to_provision_config.prev_handle, task,
3307      -  prev_cluster_status,
     3233 +  prev_cluster_status, config_hash)
3308 3234    return handle, False
3309 3235
3310 3236    cluster_config_file = config_dict['ray']
@@ -3376,7 +3302,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3376 3302
3377 3303    self._update_after_cluster_provisioned(
3378 3304    handle, to_provision_config.prev_handle, task,
3379      -  prev_cluster_status,
     3305 +  prev_cluster_status, config_hash)
3380 3306    return handle, False
3381 3307
3382 3308    def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3394,7 +3320,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3394 3320    prev_handle: Optional[CloudVmRayResourceHandle],
3395 3321    task: task_lib.Task,
3396 3322    prev_cluster_status: Optional[status_lib.ClusterStatus],
3397      -
     3323 +  config_hash: str) -> None:
3398 3324    usage_lib.messages.usage.update_cluster_resources(
3399 3325    handle.launched_nodes, handle.launched_resources)
3400 3326    usage_lib.messages.usage.update_final_cluster_status(
@@ -3406,16 +3332,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3406
3332
|
# update_status will query the ray job status for all INIT /
|
|
3407
3333
|
# PENDING / RUNNING jobs for the real status, since we do not
|
|
3408
3334
|
# know the actual previous status of the cluster.
|
|
3409
|
-
cmd = job_lib.JobLibCodeGen.update_status()
|
|
3410
3335
|
logger.debug('Update job queue on remote cluster.')
|
|
3411
3336
|
with rich_utils.safe_status(
|
|
3412
3337
|
ux_utils.spinner_message('Preparing SkyPilot runtime')):
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
|
|
3416
|
-
|
|
3417
|
-
|
|
3418
|
-
|
|
3338
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
3339
|
+
|
|
3340
|
+
if not use_legacy:
|
|
3341
|
+
try:
|
|
3342
|
+
request = jobsv1_pb2.UpdateStatusRequest()
|
|
3343
|
+
backend_utils.invoke_skylet_with_retries(
|
|
3344
|
+
lambda: SkyletClient(handle.get_grpc_channel()
|
|
3345
|
+
).update_status(request))
|
|
3346
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
3347
|
+
use_legacy = True
|
|
3348
|
+
|
|
3349
|
+
if use_legacy:
|
|
3350
|
+
cmd = job_lib.JobLibCodeGen.update_status()
|
|
3351
|
+
returncode, _, stderr = self.run_on_head(
|
|
3352
|
+
handle, cmd, require_outputs=True)
|
|
3353
|
+
subprocess_utils.handle_returncode(
|
|
3354
|
+
returncode, cmd, 'Failed to update job status.', stderr)
|
|
3419
3355
|
if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
|
|
3420
3356
|
# Safely set all the previous jobs to FAILED since the cluster
|
|
3421
3357
|
# is restarted
|
|
@@ -3423,14 +3359,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3423
3359
|
# 1. A job finishes RUNNING, but right before it update itself
|
|
3424
3360
|
# to SUCCEEDED, the cluster is STOPPED by `sky stop`.
|
|
3425
3361
|
# 2. On next `sky start`, it gets reset to FAILED.
|
|
3426
|
-
|
|
3427
|
-
|
|
3428
|
-
|
|
3429
|
-
|
|
3430
|
-
|
|
3431
|
-
|
|
3432
|
-
|
|
3433
|
-
|
|
3362
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
3363
|
+
|
|
3364
|
+
if not use_legacy:
|
|
3365
|
+
try:
|
|
3366
|
+
fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
|
|
3367
|
+
backend_utils.invoke_skylet_with_retries(
|
|
3368
|
+
lambda: SkyletClient(handle.get_grpc_channel(
|
|
3369
|
+
)).fail_all_in_progress_jobs(fail_request))
|
|
3370
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
3371
|
+
use_legacy = True
|
|
3372
|
+
|
|
3373
|
+
if use_legacy:
|
|
3374
|
+
cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
|
|
3375
|
+
returncode, stdout, stderr = self.run_on_head(
|
|
3376
|
+
handle, cmd, require_outputs=True)
|
|
3377
|
+
subprocess_utils.handle_returncode(
|
|
3378
|
+
returncode, cmd,
|
|
3379
|
+
'Failed to set previously in-progress jobs to FAILED',
|
|
3380
|
+
stdout + stderr)
|
|
3434
3381
|
|
|
3435
3382
|
prev_ports = None
|
|
3436
3383
|
if prev_handle is not None:
|
|
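Several of the hunks above (updating job status, failing in-progress jobs after a restart, and later job submission) repeat the same call shape: try the new Skylet gRPC method first, and fall back to the legacy remote code-generation path when the remote runtime predates the RPC. The sketch below isolates that shape with hypothetical callables standing in for the real gRPC client and SSH/codegen paths; the exception class is a stand-in, not the SkyPilot one.

```python
from typing import Callable


class MethodNotImplemented(Exception):
    """Stand-in for a 'remote Skylet does not implement this RPC' error."""


def call_skylet(grpc_call: Callable[[], None],
                legacy_call: Callable[[], None],
                grpc_enabled: bool) -> None:
    """Prefer the gRPC path; use the legacy SSH/codegen path otherwise."""
    if grpc_enabled:
        try:
            grpc_call()
            return
        except MethodNotImplemented:
            # Remote runtime is older than the RPC: fall through to legacy.
            pass
    legacy_call()


# Usage sketch: update the job queue state on the cluster head node.
call_skylet(
    grpc_call=lambda: print('gRPC UpdateStatus'),
    legacy_call=lambda: print('run job_lib codegen over SSH'),
    grpc_enabled=True,
)
```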
@@ -3485,8 +3432,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3485 3432    handle.cached_external_ssh_ports, handle.docker_user,
3486 3433    handle.ssh_user)
3487 3434
3488      -  locks.get_lock(lock_id).force_unlock()
3489      -
3490 3435    def _sync_workdir(self, handle: CloudVmRayResourceHandle,
3491 3436    workdir: Union[Path, Dict[str, Any]],
3492 3437    envs_and_secrets: Dict[str, str]) -> None:
@@ -3618,8 +3563,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3618 3563    self._set_storage_mounts_metadata(handle.cluster_name,
3619 3564    storage_mounts)
3620 3565
     3566 +  def _get_num_gpus(self, task: task_lib.Task) -> int:
     3567 +  if task.resources is not None:
     3568 +  for resource in task.resources:
     3569 +  if (resource.accelerators is not None and
     3570 +  isinstance(resource.accelerators, dict)):
     3571 +  if len(resource.accelerators) > 0:
     3572 +  return math.ceil(
     3573 +  list(resource.accelerators.values())[0])
     3574 +  return 0
     3575 +
3621 3576    def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
3622 3577    detach_setup: bool) -> None:
     3578 +
3623 3579    start = time.time()
3624 3580
3625 3581    if task.setup is None:
@@ -3630,13 +3586,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3630
3586
|
remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
|
|
3631
3587
|
# Need this `-i` option to make sure `source ~/.bashrc` work
|
|
3632
3588
|
setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
|
|
3589
|
+
unset_ray_env_vars = ' && '.join(
|
|
3590
|
+
[f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
|
|
3591
|
+
setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
|
|
3633
3592
|
runners = handle.get_command_runners(avoid_ssh_control=True)
|
|
3634
3593
|
|
|
3635
3594
|
def _setup_node(node_id: int) -> None:
|
|
3636
|
-
setup_envs =
|
|
3595
|
+
setup_envs = task_lib.get_plaintext_envs_and_secrets(
|
|
3596
|
+
task.envs_and_secrets)
|
|
3637
3597
|
setup_envs.update(self._skypilot_predefined_env_vars(handle))
|
|
3638
3598
|
setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
|
|
3639
3599
|
setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
|
|
3600
|
+
setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
|
|
3601
|
+
self._get_num_gpus(task)))
|
|
3602
|
+
|
|
3640
3603
|
runner = runners[node_id]
|
|
3641
3604
|
setup_script = log_lib.make_task_bash_script(setup,
|
|
3642
3605
|
env_vars=setup_envs)
|
|
@@ -3664,7 +3627,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3664 3627    _dump_final_script(setup_script,
3665 3628    constants.PERSISTENT_SETUP_SCRIPT_PATH)
3666 3629
3667      -  if detach_setup or
     3630 +  if (detach_setup or
     3631 +  backend_utils.is_command_length_over_limit(encoded_script)):
3668 3632    _dump_final_script(setup_script)
3669 3633    create_script_code = 'true'
3670 3634    else:
@@ -3693,29 +3657,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3693
3657
|
|
|
3694
3658
|
returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
|
|
3695
3659
|
|
|
3696
|
-
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
|
|
3700
|
-
encoding='utf-8') as f:
|
|
3701
|
-
return match_str.lower() in f.read().lower()
|
|
3702
|
-
except Exception as e: # pylint: disable=broad-except
|
|
3703
|
-
# We don't crash the setup if we cannot read the log file.
|
|
3704
|
-
# Instead, we should retry the setup with dumping the script
|
|
3705
|
-
# to a file to be safe.
|
|
3706
|
-
logger.debug(
|
|
3707
|
-
f'Failed to read setup log file {setup_log_path}: {e}')
|
|
3708
|
-
return True
|
|
3709
|
-
|
|
3710
|
-
if ((returncode == 255 and _load_setup_log_and_match('too long')) or
|
|
3711
|
-
(returncode == 1 and
|
|
3712
|
-
_load_setup_log_and_match('request-uri too large'))):
|
|
3713
|
-
# If the setup script is too long, we retry it with dumping
|
|
3714
|
-
# the script to a file and running it with SSH. We use a
|
|
3715
|
-
# general length limit check before but it could be
|
|
3716
|
-
# inaccurate on some systems.
|
|
3717
|
-
# When there is a cloudflare proxy in front of the remote, it
|
|
3718
|
-
# could cause `414 Request-URI Too Large` error.
|
|
3660
|
+
if _is_message_too_long(returncode, file_path=setup_log_path):
|
|
3661
|
+
# If the setup script is too long, we need to retry it
|
|
3662
|
+
# with dumping the script to a file and running it the script
|
|
3663
|
+
# on remote cluster instead.
|
|
3719
3664
|
logger.debug('Failed to run setup command inline due to '
|
|
3720
3665
|
'command length limit. Dumping setup script to '
|
|
3721
3666
|
'file and running it with SSH.')
|
|
@@ -3779,119 +3724,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3779
3724
|
logger.info(
|
|
3780
3725
|
ux_utils.finishing_message('Setup completed.', setup_log_path))
|
|
3781
3726
|
|
|
3727
|
+
def _download_file(self, handle: CloudVmRayResourceHandle,
|
|
3728
|
+
local_file_path: str, remote_file_path: str) -> None:
|
|
3729
|
+
"""Syncs file from remote to local."""
|
|
3730
|
+
runners = handle.get_command_runners()
|
|
3731
|
+
head_runner = runners[0]
|
|
3732
|
+
head_runner.rsync(
|
|
3733
|
+
source=local_file_path,
|
|
3734
|
+
target=remote_file_path,
|
|
3735
|
+
up=False,
|
|
3736
|
+
stream_logs=False,
|
|
3737
|
+
)
|
|
3738
|
+
|
|
3782
3739
|
def _exec_code_on_head(
|
|
3783
3740
|
self,
|
|
3784
3741
|
handle: CloudVmRayResourceHandle,
|
|
3785
3742
|
codegen: str,
|
|
3786
3743
|
job_id: int,
|
|
3787
|
-
detach_run: bool = False,
|
|
3788
3744
|
managed_job_dag: Optional['dag.Dag'] = None,
|
|
3745
|
+
managed_job_user_id: Optional[str] = None,
|
|
3789
3746
|
remote_log_dir: Optional[str] = None,
|
|
3790
3747
|
) -> None:
|
|
3791
3748
|
"""Executes generated code on the head node."""
|
|
3792
|
-
|
|
3749
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
3750
|
+
file_name = f'sky_job_{job_id}'
|
|
3751
|
+
script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
|
|
3793
3752
|
if remote_log_dir is None:
|
|
3794
3753
|
remote_log_dir = self.log_dir
|
|
3795
3754
|
remote_log_path = os.path.join(remote_log_dir, 'run.log')
|
|
3796
3755
|
|
|
3797
|
-
|
|
3756
|
+
def _dump_code_to_file(codegen: str,
|
|
3757
|
+
target_dir: str = SKY_REMOTE_APP_DIR) -> None:
|
|
3758
|
+
runners = handle.get_command_runners()
|
|
3759
|
+
head_runner = runners[0]
|
|
3760
|
+
with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
|
|
3761
|
+
fp.write(codegen)
|
|
3762
|
+
fp.flush()
|
|
3763
|
+
script_path = os.path.join(target_dir, file_name)
|
|
3764
|
+
# We choose to sync code + exec, because the alternative of
|
|
3765
|
+
# 'ray submit' may not work as it may use system python
|
|
3766
|
+
# (python2) to execute the script. Happens for AWS.
|
|
3767
|
+
head_runner.rsync(source=fp.name,
|
|
3768
|
+
target=script_path,
|
|
3769
|
+
up=True,
|
|
3770
|
+
stream_logs=False)
|
|
3798
3771
|
|
|
3772
|
+
cd = f'cd {SKY_REMOTE_WORKDIR}'
|
|
3799
3773
|
mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
|
|
3800
3774
|
f'touch {remote_log_path}')
|
|
3801
3775
|
encoded_script = shlex.quote(codegen)
|
|
3802
3776
|
create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
|
|
3803
3777
|
job_submit_cmd = (
|
|
3804
|
-
# JOB_CMD_IDENTIFIER is used for identifying the process
|
|
3805
|
-
# with pid is the same driver process.
|
|
3778
|
+
# JOB_CMD_IDENTIFIER is used for identifying the process
|
|
3779
|
+
# retrieved with pid is the same driver process.
|
|
3806
3780
|
f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
|
|
3807
3781
|
f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
|
|
3808
3782
|
# Do not use &>, which is not POSIX and may not work.
|
|
3809
3783
|
# Note that the order of ">filename 2>&1" matters.
|
|
3810
3784
|
f'> {remote_log_path} 2>&1')
|
|
3811
|
-
|
|
3812
3785
|
code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
|
|
3813
3786
|
job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
|
|
3814
3787
|
|
|
3815
|
-
|
|
3816
|
-
target_dir: str = SKY_REMOTE_APP_DIR) -> None:
|
|
3817
|
-
runners = handle.get_command_runners()
|
|
3818
|
-
head_runner = runners[0]
|
|
3819
|
-
with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
|
|
3820
|
-
fp.write(codegen)
|
|
3821
|
-
fp.flush()
|
|
3822
|
-
script_path = os.path.join(target_dir, f'sky_job_{job_id}')
|
|
3823
|
-
# We choose to sync code + exec, because the alternative of 'ray
|
|
3824
|
-
# submit' may not work as it may use system python (python2) to
|
|
3825
|
-
# execute the script. Happens for AWS.
|
|
3826
|
-
head_runner.rsync(source=fp.name,
|
|
3827
|
-
target=script_path,
|
|
3828
|
-
up=True,
|
|
3829
|
-
stream_logs=False)
|
|
3830
|
-
|
|
3831
|
-
# Should also be ealier than _is_command_length_over_limit
|
|
3788
|
+
# Should also be ealier than is_command_length_over_limit
|
|
3832
3789
|
# Same reason as in _setup
|
|
3833
3790
|
if self._dump_final_script:
|
|
3834
3791
|
_dump_code_to_file(job_submit_cmd,
|
|
3835
3792
|
constants.PERSISTENT_RUN_SCRIPT_DIR)
|
|
3836
3793
|
|
|
3837
|
-
if
|
|
3838
|
-
|
|
3839
|
-
|
|
3840
|
-
|
|
3841
|
-
|
|
3842
|
-
|
|
3843
|
-
|
|
3844
|
-
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3849
|
-
|
|
3850
|
-
|
|
3851
|
-
|
|
3852
|
-
|
|
3853
|
-
|
|
3854
|
-
|
|
3855
|
-
|
|
3856
|
-
|
|
3857
|
-
|
|
3858
|
-
|
|
3859
|
-
|
|
3860
|
-
|
|
3861
|
-
|
|
3794
|
+
if not use_legacy:
|
|
3795
|
+
try:
|
|
3796
|
+
managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
|
|
3797
|
+
if managed_job_dag is not None:
|
|
3798
|
+
workspace = skypilot_config.get_active_workspace(
|
|
3799
|
+
force_user_workspace=True)
|
|
3800
|
+
entrypoint = common_utils.get_current_command()
|
|
3801
|
+
|
|
3802
|
+
managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
|
|
3803
|
+
for task_id, task in enumerate(managed_job_dag.tasks):
|
|
3804
|
+
resources_str = backend_utils.get_task_resources_str(
|
|
3805
|
+
task, is_managed_job=True)
|
|
3806
|
+
managed_job_tasks.append(
|
|
3807
|
+
jobsv1_pb2.ManagedJobTask(
|
|
3808
|
+
task_id=task_id,
|
|
3809
|
+
name=task.name,
|
|
3810
|
+
resources_str=resources_str,
|
|
3811
|
+
metadata_json=task.metadata_json))
|
|
3812
|
+
|
|
3813
|
+
managed_job_info = jobsv1_pb2.ManagedJobInfo(
|
|
3814
|
+
name=managed_job_dag.name,
|
|
3815
|
+
pool=managed_job_dag.pool,
|
|
3816
|
+
workspace=workspace,
|
|
3817
|
+
entrypoint=entrypoint,
|
|
3818
|
+
tasks=managed_job_tasks,
|
|
3819
|
+
user_id=managed_job_user_id)
|
|
3820
|
+
|
|
3821
|
+
if backend_utils.is_command_length_over_limit(codegen):
|
|
3822
|
+
_dump_code_to_file(codegen)
|
|
3823
|
+
queue_job_request = jobsv1_pb2.QueueJobRequest(
|
|
3824
|
+
job_id=job_id,
|
|
3825
|
+
# codegen not set - server assumes script uploaded
|
|
3826
|
+
remote_log_dir=remote_log_dir,
|
|
3827
|
+
managed_job=managed_job_info,
|
|
3828
|
+
script_path=script_path)
|
|
3829
|
+
else:
|
|
3830
|
+
queue_job_request = jobsv1_pb2.QueueJobRequest(
|
|
3831
|
+
job_id=job_id,
|
|
3832
|
+
codegen=codegen,
|
|
3833
|
+
remote_log_dir=remote_log_dir,
|
|
3834
|
+
managed_job=managed_job_info,
|
|
3835
|
+
script_path=script_path)
|
|
3836
|
+
|
|
3837
|
+
backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
|
|
3838
|
+
handle.get_grpc_channel()).queue_job(queue_job_request))
|
|
3839
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
3840
|
+
use_legacy = True
|
|
3841
|
+
|
|
3842
|
+
if use_legacy:
|
|
3843
|
+
if backend_utils.is_command_length_over_limit(job_submit_cmd):
|
|
3844
|
+
_dump_code_to_file(codegen)
|
|
3845
|
+
job_submit_cmd = f'{mkdir_code} && {code}'
|
|
3846
|
+
|
|
3847
|
+
def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
|
|
3848
|
+
if managed_job_dag is not None:
|
|
3849
|
+
# Add the managed job to job queue database.
|
|
3850
|
+
managed_job_codegen = managed_jobs.ManagedJobCodeGen()
|
|
3851
|
+
managed_job_code = managed_job_codegen.set_pending(
|
|
3852
|
+
job_id,
|
|
3853
|
+
managed_job_dag,
|
|
3854
|
+
skypilot_config.get_active_workspace(
|
|
3855
|
+
force_user_workspace=True),
|
|
3856
|
+
entrypoint=common_utils.get_current_command(),
|
|
3857
|
+
user_hash=managed_job_user_id)
|
|
3858
|
+
# Set the managed job to PENDING state to make sure that
|
|
3859
|
+
# this managed job appears in the `sky jobs queue`, even
|
|
3860
|
+
# if it needs to wait to be submitted.
|
|
3861
|
+
# We cannot set the managed job to PENDING state in the
|
|
3862
|
+
# job template (jobs-controller.yaml.j2), as it may need
|
|
3863
|
+
# to wait for the run commands to be scheduled on the job
|
|
3864
|
+
# controller in high-load cases.
|
|
3865
|
+
job_submit_cmd += ' && ' + managed_job_code
|
|
3866
|
+
return job_submit_cmd
|
|
3862
3867
|
|
|
3863
|
-
returncode, stdout, stderr = self.run_on_head(handle,
|
|
3864
|
-
job_submit_cmd,
|
|
3865
|
-
stream_logs=False,
|
|
3866
|
-
require_outputs=True)
|
|
3867
|
-
# Happens when someone calls `sky exec` but remote is outdated for
|
|
3868
|
-
# running a job. Necessitating calling `sky launch`.
|
|
3869
|
-
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
|
3870
|
-
handle.cluster_name)
|
|
3871
|
-
output = stdout + stderr
|
|
3872
|
-
if ((returncode == 255 and 'too long' in output.lower()) or
|
|
3873
|
-
(returncode == 1 and 'request-uri too large' in output.lower())):
|
|
3874
|
-
# If the generated script is too long, we retry it with dumping
|
|
3875
|
-
# the script to a file and running it with SSH. We use a general
|
|
3876
|
-
# length limit check before but it could be inaccurate on some
|
|
3877
|
-
# systems.
|
|
3878
|
-
# When there is a cloudflare proxy in front of the remote, it could
|
|
3879
|
-
# cause `414 Request-URI Too Large` error.
|
|
3880
|
-
logger.debug('Failed to submit job due to command length limit. '
|
|
3881
|
-
'Dumping job to file and running it with SSH. '
|
|
3882
|
-
f'Output: {output}')
|
|
3883
|
-
_dump_code_to_file(codegen)
|
|
3884
|
-
job_submit_cmd = f'{mkdir_code} && {code}'
|
|
3885
3868
|
job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
|
|
3869
|
+
|
|
3886
3870
|
returncode, stdout, stderr = self.run_on_head(handle,
|
|
3887
3871
|
job_submit_cmd,
|
|
3888
3872
|
stream_logs=False,
|
|
3889
3873
|
require_outputs=True)
|
|
3874
|
+
# Happens when someone calls `sky exec` but remote is outdated for
|
|
3875
|
+
# running a job. Necessitating calling `sky launch`.
|
|
3876
|
+
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
|
3877
|
+
handle.cluster_name)
|
|
3878
|
+
output = stdout + stderr
|
|
3879
|
+
if _is_message_too_long(returncode, output=output):
|
|
3880
|
+
# If the job submit script is too long, we need to retry it
|
|
3881
|
+
# with dumping the script to a file and running it the script
|
|
3882
|
+
# on remote cluster instead.
|
|
3883
|
+
logger.debug(
|
|
3884
|
+
'Failed to submit job due to command length limit. '
|
|
3885
|
+
'Dumping job to file and running it with SSH. '
|
|
3886
|
+
f'Output: {output}')
|
|
3887
|
+
_dump_code_to_file(codegen)
|
|
3888
|
+
job_submit_cmd = f'{mkdir_code} && {code}'
|
|
3889
|
+
job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
|
|
3890
|
+
returncode, stdout, stderr = self.run_on_head(
|
|
3891
|
+
handle,
|
|
3892
|
+
job_submit_cmd,
|
|
3893
|
+
stream_logs=False,
|
|
3894
|
+
require_outputs=True)
|
|
3890
3895
|
|
|
3891
|
-
|
|
3892
|
-
|
|
3893
|
-
|
|
3894
|
-
|
|
3896
|
+
subprocess_utils.handle_returncode(
|
|
3897
|
+
returncode,
|
|
3898
|
+
job_submit_cmd,
|
|
3899
|
+
f'Failed to submit job {job_id}.',
|
|
3900
|
+
stderr=stdout + stderr)
|
|
3895
3901
|
|
|
3896
3902
|
controller = controller_utils.Controllers.from_name(handle.cluster_name)
|
|
3897
3903
|
if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
|
|
@@ -3900,61 +3906,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  logger.info(
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
  rich_utils.stop_safe_status()
- if not detach_run:
- if (handle.cluster_name == controller_utils.Controllers.
- JOBS_CONTROLLER.value.cluster_name):
- self.tail_managed_job_logs(handle, job_id)
- else:
- # Sky logs. Not using subprocess.run since it will make the
- # ssh keep connected after ctrl-c.
- self.tail_logs(handle, job_id)

  def _add_job(self, handle: CloudVmRayResourceHandle,
  job_name: Optional[str], resources_str: str,
  metadata: str) -> Tuple[int, str]:
- (36 removed lines, old 3915-3950, not shown in this view)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ try:
+ request = jobsv1_pb2.AddJobRequest(
+ job_name=job_name,
+ username=common_utils.get_user_hash(),
+ run_timestamp=self.run_timestamp,
+ resources_str=resources_str,
+ metadata=metadata)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+ request))
+ job_id = response.job_id
+ log_dir = response.log_dir
+ return job_id, log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.add_job(
+ job_name=job_name,
+ username=common_utils.get_user_hash(),
+ run_timestamp=self.run_timestamp,
+ resources_str=resources_str,
+ metadata=metadata)
+ returncode, result_str, stderr = self.run_on_head(
+ handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
+ # Happens when someone calls `sky exec` but remote is outdated for
+ # adding a job. Necessitating calling `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+ # retry for this, after we figure out the reason.
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to fetch job id.',
+ stderr)
+ try:
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
+ if job_id_match is not None:
+ job_id = int(job_id_match.group(1))
+ else:
+ # For backward compatibility.
+ job_id = int(result_str)
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+ if log_dir_match is not None:
+ log_dir = log_dir_match.group(1).strip()
+ else:
+ # For backward compatibility, use the same log dir as local.
+ log_dir = self.log_dir
+ except ValueError as e:
+ logger.error(stderr)
+ raise ValueError(f'Failed to parse job id: {result_str}; '
+ f'Returncode: {returncode}') from e
  return job_id, log_dir

  def _execute(
  self,
  handle: CloudVmRayResourceHandle,
  task: task_lib.Task,
- detach_run: bool,
  dryrun: bool = False,
  ) -> Optional[int]:
  """Executes the task on the cluster.
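The `_add_job` hunk above applies a pattern that recurs throughout this file: try the new Skylet gRPC method first, and fall back to the legacy remote code-gen path when the remote runtime is too old to implement it. Below is a minimal, self-contained sketch of that shape only; `MethodNotImplementedError`, `call_with_fallback`, and the two callables are hypothetical stand-ins for illustration, not SkyPilot APIs.

```python
from typing import Callable, TypeVar

T = TypeVar('T')


class MethodNotImplementedError(Exception):
    """Raised when the remote server does not implement the requested RPC."""


def call_with_fallback(grpc_call: Callable[[], T],
                       legacy_call: Callable[[], T]) -> T:
    """Prefer the gRPC path; fall back to the legacy path if unsupported."""
    try:
        return grpc_call()
    except MethodNotImplementedError:
        # Older remote runtimes do not implement the new RPC yet.
        return legacy_call()


def new_path() -> int:
    raise MethodNotImplementedError()


def old_path() -> int:
    return 42


print(call_with_fallback(new_path, old_path))  # -> 42
```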
@@ -4006,12 +4025,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
  # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
  if num_actual_nodes > 1:
- self._execute_task_n_nodes(handle, task_copy, job_id,
- log_dir)
+ self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
  else:
  # Case: task_lib.Task(run, num_nodes=1)
- self._execute_task_one_node(handle, task_copy, job_id,
- log_dir)
+ self._execute_task_one_node(handle, task_copy, job_id, log_dir)

  return job_id

@@ -4054,7 +4071,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  is_identity_mismatch_and_purge = False
  try:
  backend_utils.check_owner_identity(cluster_name)
- except exceptions.ClusterOwnerIdentityMismatchError
+ except (exceptions.ClusterOwnerIdentityMismatchError,
+ exceptions.CloudUserIdentityError) as e:
  if purge:
  logger.error(e)
  verbed = 'terminated' if terminate else 'stopped'
@@ -4068,15 +4086,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  else:
  raise
  lock_id = backend_utils.cluster_status_lock_id(cluster_name)
- lock = locks.get_lock(lock_id)
+ lock = locks.get_lock(lock_id, timeout=1)
  # Retry in case new cluster operation comes in and holds the lock
  # right after the lock is removed.
  n_attempts = 2
  while True:
  n_attempts -= 1
- # In case other running cluster operations are still holding the
- # lock.
- lock.force_unlock()
  # We have to kill the cluster requests, because `down` and `stop`
  # should be higher priority than the cluster requests, and we should
  # release the lock from other requests.
@@ -4094,6 +4109,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  'Failed to kill other launch requests for the '
  f'cluster {handle.cluster_name}: '
  f'{common_utils.format_exception(e, use_bracket=True)}')
+ # In case other running cluster operations are still holding the
+ # lock.
+ lock.force_unlock()
  try:
  with lock:
  self.teardown_no_lock(
@@ -4126,6 +4144,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  job_ids: Optional[List[int]] = None,
  stream_logs: bool = True
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).get_job_status(request))
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
+ for job_id, proto_status in response.job_statuses.items()
+ }
+ return statuses
+ except exceptions.SkyletMethodNotImplementedError:
+ pass
+
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
  returncode, stdout, stderr = self.run_on_head(handle,
  code,
@@ -4146,16 +4178,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
  """
- (10 removed lines, old 4149-4158, not shown in this view)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ try:
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+ cancel_all=cancel_all,
+ user_hash=user_hash)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+ request))
+ cancelled_ids = response.cancelled_job_ids
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+ user_hash)
+ returncode, stdout, _ = self.run_on_head(handle,
+ code,
+ stream_logs=False,
+ require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, code,
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+ stdout)
+ cancelled_ids = message_utils.decode_payload(stdout)
  if cancelled_ids:
  logger.info(
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4172,20 +4220,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Returns:
  A dictionary mapping job_id to log path.
  """
- (2 removed lines, old 4175-4176, not shown in this view)
+ job_to_dir: Dict[str, str] = {}
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if not use_legacy:
+ try:
+ int_job_ids = []
+ if job_ids:
+ for str_job_id in job_ids:
+ if str_job_id.isdigit():
+ int_job_ids.append(int(str_job_id))
+ request = jobsv1_pb2.GetLogDirsForJobsRequest(
+ job_ids=int_job_ids)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).get_log_dirs_for_jobs(request))
+ job_log_dirs = response.job_log_dirs
+ if not job_log_dirs:
+ logger.info(f'{colorama.Fore.YELLOW}'
+ 'No matching log directories found'
+ f'{colorama.Style.RESET_ALL}')
+ return {}
+ for job_id, log_dir in job_log_dirs.items():
+ # Convert to string for backwards compatibility
+ job_to_dir[str(job_id)] = log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+ returncode, stdout, stderr = self.run_on_head(handle,
  code,
  stream_logs=False,
  require_outputs=True,
  separate_stderr=True)
- (8 removed lines, old 4181-4188, not shown in this view)
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to sync logs.', stderr)
+ job_to_dir = message_utils.decode_payload(stdout)
+ if not job_to_dir:
+ logger.info(f'{colorama.Fore.YELLOW}'
+ 'No matching log directories found'
+ f'{colorama.Style.RESET_ALL}')
+ return {}

  job_ids = list(job_to_dir.keys())
  dirs = list(job_to_dir.values())
@@ -4195,9 +4271,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
  constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
  ]
- (3 removed lines, old 4198-4200, not shown in this view)
+ # Include cluster name in local log directory path to avoid conflicts
+ # when the same job_id exists on different clusters
+ cluster_name = handle.cluster_name
+ local_log_dirs = []
+ for remote_log_dir in dirs:
+ if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
+ # Extract the job-specific directory name from the full path
+ # e.g., ~/sky_logs/1-job_name -> 1-job_name
+ job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
+ '').lstrip('/')
+ local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
+ else:
+ # remote_log_dir is already just the job directory name (e.g.,
+ # "1-job_name")
+ local_log_dir = os.path.join(local_dir, cluster_name,
+ remote_log_dir)
+ local_log_dirs.append(local_log_dir)

  runners = handle.get_command_runners()

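The hunk above namespaces downloaded job logs by cluster so that identical job IDs on different clusters do not collide locally. A small, self-contained sketch of that path mapping is below; the `~/sky_logs` prefix is an assumption for demonstration (the real prefix comes from `constants.SKY_LOGS_DIRECTORY`).

```python
import os

LOGS_PREFIX = '~/sky_logs'


def to_local_log_dir(remote_log_dir: str, local_dir: str,
                     cluster_name: str) -> str:
    """Map a remote job log dir to <local_dir>/<cluster_name>/<job_dir>."""
    if LOGS_PREFIX in remote_log_dir:
        # e.g. ~/sky_logs/1-job_name -> 1-job_name
        job_dir = remote_log_dir.replace(LOGS_PREFIX, '').lstrip('/')
    else:
        # Already just the job directory name, e.g. "1-job_name".
        job_dir = remote_log_dir
    return os.path.join(local_dir, cluster_name, job_dir)


print(to_local_log_dir('~/sky_logs/1-train', '/tmp/logs', 'my-cluster'))
# -> /tmp/logs/my-cluster/1-train
```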
@@ -4261,6 +4351,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  The exit code of the tail command. Returns code 100 if the job has
  failed. See exceptions.JobExitCode for possible return codes.
  """
+ if handle.is_grpc_enabled_with_flag:
+ last_exit_code = 0
+ try:
+ request = jobsv1_pb2.TailLogsRequest(
+ job_id=job_id,
+ managed_job_id=managed_job_id,
+ follow=follow,
+ tail=tail)
+ for resp in backend_utils.invoke_skylet_streaming_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).tail_logs(request, timeout=None)):
+ if resp.log_line:
+ print(resp.log_line, end='', flush=True)
+ last_exit_code = resp.exit_code
+ return last_exit_code
+ except exceptions.SkyletMethodNotImplementedError:
+ pass
+ except grpc.RpcError as e:
+ if e.code() == grpc.StatusCode.CANCELLED:
+ return last_exit_code
+ raise e
+
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
  managed_job_id=managed_job_id,
  follow=follow,
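The tailing hunk above consumes a server-streaming RPC: print every streamed log line and keep the exit code carried by the last response. A minimal sketch of that consumption loop follows; `LogChunk` is a hypothetical stand-in for the real protobuf response type, not a SkyPilot class.

```python
from dataclasses import dataclass
from typing import Iterable


@dataclass
class LogChunk:
    log_line: str
    exit_code: int


def tail_stream(responses: Iterable[LogChunk]) -> int:
    """Return the exit code carried by the last streamed response."""
    last_exit_code = 0
    for resp in responses:
        if resp.log_line:
            # Stream the line through without adding an extra newline.
            print(resp.log_line, end='', flush=True)
        last_exit_code = resp.exit_code
    return last_exit_code


code = tail_stream([LogChunk('hello\n', 0), LogChunk('done\n', 100)])
print(f'exit code: {code}')  # -> exit code: 100
```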
@@ -4298,6 +4410,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  tail: Optional[int] = None) -> int:
  # if job_name is not None, job_id should be None
  assert job_name is None or job_id is None, (job_name, job_id)
+ # TODO(kevin): Migrate stream_logs to gRPC
  code = managed_jobs.ManagedJobCodeGen.stream_logs(
  job_name, job_id, follow, controller, tail)

@@ -4343,20 +4456,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  assert job_name is None or job_id is None, (job_name, job_id)

  if job_id is None:
- # (removed comment line, old 4346; text truncated in this view)
+ # get the job_id
  # if job_name is None, get all job_ids
  # TODO: Only get the latest job_id, since that's the only one we use
- (11 removed lines, old 4349-4359, not shown in this view)
+
+ use_legacy = not handle.is_grpc_enabled_with_flag
+ logger.info(f'handle.is_grpc_enabled_with_flag: '
+ f'{handle.is_grpc_enabled_with_flag}')
+ if not use_legacy:
+ try:
+ request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+ job_name=job_name)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel(
+ )).get_all_managed_job_ids_by_name(request))
+ job_ids = list(response.job_ids)
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+ job_name=job_name)
+ returncode, job_ids_payload, stderr = self.run_on_head(
+ handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to sync down logs.',
+ stderr)
+ job_ids = message_utils.decode_payload(job_ids_payload)
  if not job_ids:
  logger.info(f'{colorama.Fore.YELLOW}'
  'No matching job found'
@@ -4384,18 +4514,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  else:
  # get the run_timestamp
  # the function takes in [job_id]
- (12 removed lines, old 4387-4398, not shown in this view)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+ if not use_legacy:
+ try:
+ log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+ job_ids=[job_id])
+ log_dirs_response = (
+ backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel(
+ )).get_log_dirs_for_jobs(log_dirs_request)))
+ job_log_dirs = log_dirs_response.job_log_dirs
+ # Convert back to the expected format
+ # {job_id: run_timestamp}
+ run_timestamps = {}
+ for jid, log_dir in job_log_dirs.items():
+ run_timestamps[int(jid)] = log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+ [str(job_id)])
+ returncode, run_timestamps_payload, stderr = self.run_on_head(
+ handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to sync logs.',
+ stderr)
+ # returns with a dict of {job_id: run_timestamp}
+ run_timestamps = message_utils.decode_payload(
+ run_timestamps_payload)
  if not run_timestamps:
  logger.info(f'{colorama.Fore.YELLOW}'
  'No matching log directories found'
@@ -4462,11 +4613,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  exist_ok=True)
  log_file = os.path.join(local_log_dir, 'run.log')

- (5 removed lines, old 4465-4469, not shown in this view)
+ # TODO(kevin): Migrate stream_logs to gRPC
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(
+ job_name=None,
+ job_id=int(job_id),
+ follow=False,
+ controller=False)
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not
  # kill the process, so we need to handle it manually here.
  if threading.current_thread() is threading.main_thread():
@@ -4507,6 +4659,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Raises:
  RuntimeError: If the cluster fails to be terminated/stopped.
  """
+ try:
+ handle.close_skylet_ssh_tunnel()
+ except Exception as e: # pylint: disable=broad-except
+ # Not critical to the cluster teardown, just log a warning.
+ logger.warning(
+ 'Failed to close Skylet SSH tunnel for cluster '
+ f'{handle.cluster_name}: '
+ f'{common_utils.format_exception(e, use_bracket=True)}')
+
  exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
  # We have to kill the cluster requests again within the lock, because
  # any pending requests on the same cluster should be cancelled after
@@ -4543,7 +4704,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # observed in AWS. See also
  # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
- (removed line, old 4546, not shown in this view)
+ cluster_lock_already_held=True,
+ retry_if_missing=False))
  cluster_status_fetched = True
  except exceptions.ClusterStatusFetchingError:
  logger.warning(
@@ -4551,10 +4713,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  f'{handle.cluster_name!r}. Assuming the cluster is still '
  'up.')
  if not cluster_status_fetched:
- (removed line, old 4554, not shown in this view)
+ status = global_user_state.get_status_from_cluster_name(
  handle.cluster_name)
- prev_cluster_status = record[
- 'status'] if record is not None else None
+ prev_cluster_status = status if status is not None else None
  if prev_cluster_status is None:
  # When the cluster is not in the cluster table, we guarantee that
  # all related resources / cache / config are cleaned up, i.e. it
@@ -4786,7 +4947,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  config['provider'])
  ports_cleaned_up = True
  except exceptions.NotSupportedError:
- (removed line, old 4789, not shown in this view)
+ ports_cleaned_up = True
  except exceptions.PortDoesNotExistError:
  logger.debug('Ports do not exist. Skipping cleanup.')
  except Exception as e: # pylint: disable=broad-except
@@ -4811,7 +4972,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  failover)
  custom_multi_network_cleaned_up = True
  except exceptions.NotSupportedError:
- (removed line, old 4814, not shown in this view)
+ custom_multi_network_cleaned_up = True
  except Exception as e: # pylint: disable=broad-except
  if purge:
  msg = common_utils.format_exception(e, use_bracket=True)
@@ -4913,7 +5074,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  cluster_yaml_path = handle.cluster_yaml
  handle.cluster_yaml = None
  global_user_state.update_cluster_handle(handle.cluster_name, handle)
- global_user_state.remove_cluster_yaml(handle.cluster_name)
+ # Removing the cluster YAML can cause some unexpected stability issues.
+ # See #5011.
+ # global_user_state.remove_cluster_yaml(handle.cluster_name)
  common_utils.remove_file_if_exists(cluster_yaml_path)

  def set_autostop(self,
@@ -4974,9 +5137,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
  down=down,
  )
- backend_utils.invoke_skylet_with_retries(
- handle
- set_autostop(request))
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+ handle.get_grpc_channel()).set_autostop(request))
  else:
  code = autostop_lib.AutostopCodeGen.set_autostop(
  idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5177,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  try:
  request = autostopv1_pb2.IsAutostoppingRequest()
  response = backend_utils.invoke_skylet_with_retries(
- (2 removed lines, old 5018-5019, not shown in this view)
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).is_autostopping(request))
  return response.is_autostopping
  except Exception as e: # pylint: disable=broad-except
  # The cluster may have been terminated, causing the gRPC call
@@ -5128,7 +5290,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  exceptions.InvalidClusterNameError: If the cluster name is invalid.
  # TODO(zhwu): complete the list of exceptions.
  """
- record = global_user_state.get_cluster_from_name(
+ record = global_user_state.get_cluster_from_name(
+ cluster_name, include_user_info=False, summary_response=True)
  if record is None:
  handle_before_refresh = None
  status_before_refresh = None
@@ -5148,7 +5311,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  record = backend_utils.refresh_cluster_record(
  cluster_name,
  force_refresh_statuses={status_lib.ClusterStatus.INIT},
- (removed line, old 5151, not shown in this view)
+ cluster_lock_already_held=True,
+ include_user_info=False,
+ summary_response=True,
  )
  if record is not None:
  prev_cluster_status = record['status']
@@ -5264,33 +5429,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  common_utils.check_cluster_name_is_valid(cluster_name)

  if to_provision is None:
- (7 removed comment lines, old 5267-5273; text truncated in this view)
- (4 removed lines, old 5274-5277, not shown in this view)
- handle_before_refresh,
- (15 removed lines, old 5279-5293, not shown in this view)
+ # Recently terminated after refresh. OPTIMIZE usually ran outside
+ # the lock, so that decision may be stale by now. Under the lock,
+ # ensure we always have a concrete plan via the following order:
+ # 1) Reuse last placement snapshot (if available);
+ # 2) Else, call injected planner for a fresh plan.
+ # If we still have a pre-refresh handle snapshot with a concrete
+ # placement, prefer reusing it.
+ if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+ handle_before_refresh.launched_resources is not None):
+ to_provision = handle_before_refresh.launched_resources
+ # Ensure the requested task fits the previous placement.
+ self.check_resources_fit_cluster(handle_before_refresh, task)
+ # Mirror the original message for reuse path.
+ status_before_refresh_str = None
+ if status_before_refresh is not None:
+ status_before_refresh_str = status_before_refresh.value
+ logger.info(
+ f'The cluster {cluster_name!r} (status: '
+ f'{status_before_refresh_str}) was not found on the cloud: '
+ 'it may be autodowned, manually terminated, or its launch '
+ 'never succeeded. Provisioning a new cluster by using the '
+ 'same resources as its original launch.')
+ elif self._planner is not None:
+ to_provision = self._planner(task)
+ logger.info(
+ 'Previous placement snapshot missing; computing a fresh '
+ 'plan for provisioning.')
+ else:
+ # Without a snapshot or planner, we cannot proceed safely.
+ # Surface a user-friendly error without a long traceback.
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(
+ 'No concrete launch plan available after recent cloud '
+ f'termination of cluster {cluster_name!r}. Ensure the '
+ 'OPTIMIZE stage runs or provide concrete resources.')

  return RetryingVmProvisioner.ToProvisionConfig(
  cluster_name,
@@ -5639,7 +5812,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
  handle: CloudVmRayResourceHandle) -> Dict[str, str]:
  """Returns the environment variables for the task."""
- env_vars =
+ env_vars = task_lib.get_plaintext_envs_and_secrets(
+ task.envs_and_secrets)
  # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
  # by the controller.
  if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5651,9 +5825,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  env_vars.update(self._skypilot_predefined_env_vars(handle))
  return env_vars

+ def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+ """Returns the user id for the managed job."""
+ if task.managed_job_dag is not None:
+ return task.envs[constants.USER_ID_ENV_VAR]
+ return None
+
+ def _get_task_codegen_class(
+ self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
+ """Returns the appropriate TaskCodeGen for the given handle."""
+ if isinstance(handle.launched_resources.cloud, clouds.Slurm):
+ assert (handle.cached_cluster_info
+ is not None), ('cached_cluster_info must be set')
+ head_instance = handle.cached_cluster_info.get_head_instance()
+ assert (head_instance is not None), (
+ 'Head instance not found in cached cluster info')
+ slurm_job_id = head_instance.tags.get('job_id')
+ assert (slurm_job_id
+ is not None), ('job_id tag not found in head instance')
+ return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
+ else:
+ return task_codegen.RayCodeGen()
+
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
- (removed line, old 5656, not shown in this view)
+ remote_log_dir: str) -> None:
  # Launch the command as a Ray task.
  log_dir = os.path.join(remote_log_dir, 'tasks')

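The hunk above introduces a dispatch point that picks a backend-specific task code generator (Slurm vs. Ray) from the cluster handle. Below is a simplified, self-contained sketch of that dispatch shape only; `TaskCodeGen`, `RayCodeGen`, `SlurmCodeGen`, and `pick_codegen` here are illustrative stand-ins keyed on a plain cloud name, not the real classes in `sky/backends/task_codegen.py`.

```python
class TaskCodeGen:
    def build(self) -> str:
        raise NotImplementedError


class RayCodeGen(TaskCodeGen):
    def build(self) -> str:
        return '# ray-based task program'


class SlurmCodeGen(TaskCodeGen):
    def __init__(self, slurm_job_id: str) -> None:
        self.slurm_job_id = slurm_job_id

    def build(self) -> str:
        return f'# srun --jobid={self.slurm_job_id} task program'


def pick_codegen(cloud_name: str, slurm_job_id: str = '') -> TaskCodeGen:
    """Return a Slurm code generator for Slurm clusters, Ray otherwise."""
    if cloud_name.lower() == 'slurm':
        return SlurmCodeGen(slurm_job_id=slurm_job_id)
    return RayCodeGen()


print(pick_codegen('slurm', '12345').build())
print(pick_codegen('aws').build())
```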
@@ -5663,42 +5859,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  task_env_vars = self._get_task_env_vars(task, job_id, handle)

- codegen =
+ codegen = self._get_task_codegen_class(handle)
+
  codegen.add_prologue(job_id)
- codegen.
+ codegen.add_setup(
  1,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
  env_vars=task_env_vars,
+ log_dir=log_dir,
  setup_cmd=self._setup_cmd,
- setup_log_path=os.path.join(log_dir, 'setup.log'),
  )

- (3 removed lines, old 5677-5679, not shown in this view)
- codegen.register_run_fn(run_fn_code, run_fn_name)
- (removed line, old 5681, not shown in this view)
- command_for_node = task.run if isinstance(task.run, str) else None
- codegen.add_ray_task(
- bash_script=command_for_node,
+ codegen.add_task(
+ 1,
+ bash_script=task.run,
  env_vars=task_env_vars,
  task_name=task.name,
- (removed line, old 5687, not shown in this view)
+ resources_dict=backend_utils.get_task_demands_dict(task),
  log_dir=log_dir)

  codegen.add_epilogue()

- self._exec_code_on_head(
- (5 removed lines, old 5693-5697, not shown in this view)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)

  def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
- (removed line, old 5701, not shown in this view)
+ remote_log_dir: str) -> None:
  # Strategy:
  # ray.init(...)
  # for node:
@@ -5712,42 +5905,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
  task_env_vars = self._get_task_env_vars(task, job_id, handle)

- codegen =
+ codegen = self._get_task_codegen_class(handle)
+
  codegen.add_prologue(job_id)
- codegen.
+ codegen.add_setup(
  num_actual_nodes,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
  env_vars=task_env_vars,
+ log_dir=log_dir,
  setup_cmd=self._setup_cmd,
- setup_log_path=os.path.join(log_dir, 'setup.log'),
  )

- (7 removed lines, old 5726-5732, not shown in this view)
- for i in range(num_actual_nodes):
- command_for_node = task.run if isinstance(task.run, str) else None
- (removed line, old 5735, not shown in this view)
- # Ray's per-node resources, to constrain scheduling each command to
- # the corresponding node, represented by private IPs.
- codegen.add_ray_task(
- bash_script=command_for_node,
- env_vars=task_env_vars,
- task_name=task.name,
- ray_resources_dict=backend_utils.get_task_demands_dict(task),
- log_dir=log_dir,
- gang_scheduling_id=i)
+ codegen.add_task(
+ num_actual_nodes,
+ bash_script=task.run,
+ env_vars=task_env_vars,
+ task_name=task.name,
+ resources_dict=backend_utils.get_task_demands_dict(task),
+ log_dir=log_dir)

  codegen.add_epilogue()
  # TODO(zhanghao): Add help info for downloading logs.
- self._exec_code_on_head(
- (5 removed lines, old 5749-5753, not shown in this view)
+ self._exec_code_on_head(
+ handle,
+ codegen.build(),
+ job_id,
+ managed_job_dag=task.managed_job_dag,
+ managed_job_user_id=self._get_managed_job_user_id(task),
+ remote_log_dir=remote_log_dir)