skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -2,14 +2,15 @@
 import copy
 import dataclasses
 import enum
-import inspect
 import json
 import math
 import os
 import pathlib
+import random
 import re
 import shlex
 import signal
+import socket
 import subprocess
 import sys
 import tempfile
@@ -17,8 +18,8 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)
 
 import colorama
 import psutil
@@ -39,6 +40,7 @@ from sky import skypilot_config
 from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import task_codegen
 from sky.backends import wheel_utils
 from sky.clouds import cloud as sky_cloud
 from sky.clouds.utils import gcp_utils
@@ -48,14 +50,15 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.serve import constants as serve_constants
 from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
-from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import cluster_utils
 from sky.utils import command_runner
@@ -85,13 +88,34 @@ if typing.TYPE_CHECKING:
     from sky import dag
     from sky.schemas.generated import autostopv1_pb2
     from sky.schemas.generated import autostopv1_pb2_grpc
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
+    from sky.schemas.generated import servev1_pb2
+    from sky.schemas.generated import servev1_pb2_grpc
 else:
     # To avoid requiring grpcio to be installed on the client side.
-    grpc = adaptors_common.LazyImport(
+    grpc = adaptors_common.LazyImport(
+        'grpc',
+        # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
+        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
+        if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
     autostopv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2')
     autostopv1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2_grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.jobsv1_pb2_grpc')
+    servev1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.servev1_pb2')
+    servev1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')
 
 Path = str
 
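The hunk above (in sky/backends/cloud_vm_ray_backend.py, per the file list) wraps the grpc import in LazyImport so that client installs without grpcio still import cleanly, and uses the set_loggers hook to silence gRPC console output unless SHOW_DEBUG_INFO is set. Below is a minimal sketch of the lazy-import pattern, assuming only the standard library; this LazyImport is a simplified stand-in for sky.adaptors.common.LazyImport, not its actual implementation.

    import importlib
    import os
    import types
    from typing import Callable, Optional

    class LazyImport(types.ModuleType):
        # Minimal sketch: defer a module import until first attribute
        # access. Simplified stand-in for sky.adaptors.common.LazyImport.

        def __init__(self, module_name: str,
                     set_loggers: Optional[Callable[[], None]] = None):
            super().__init__(module_name)
            self._module = None
            self._set_loggers = set_loggers

        def __getattr__(self, name: str):
            if self._module is None:
                if self._set_loggers is not None:
                    # Runs once, right before the real import (e.g. to set
                    # GRPC_VERBOSITY=NONE and silence console spam).
                    self._set_loggers()
                self._module = importlib.import_module(self.__name__)
            return getattr(self._module, name)

    # grpcio is only imported when an attribute is first touched, so
    # clients without grpcio installed do not fail at import time.
    grpc = LazyImport(
        'grpc',
        set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'}))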
@@ -113,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
     clouds.OCI: 300,
     clouds.Paperspace: 600,
     clouds.Kubernetes: 300,
+    clouds.Shadeform: 300,
     clouds.Vsphere: 240,
 }
 
@@ -179,6 +204,12 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # We use 100KB as a threshold to be safe for other arguments that
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+_EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
+    ('too long', 255),
+    ('request-uri too large', 1),
+    ('request header fields too large', 1),
+    ('400 bad request', 1),  # CloudFlare 400 error
+]
 
 _RESOURCES_UNAVAILABLE_LOG = (
     'Reasons for provision failures (for details, please check the log above):')
@@ -199,6 +230,61 @@ def _is_command_length_over_limit(command: str) -> bool:
     return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
 
 
+def _is_message_too_long(returncode: int,
+                         output: Optional[str] = None,
+                         file_path: Optional[str] = None) -> bool:
+    """Check if the message sent to the remote is too long.
+
+    We use inline script to run the setup or run command, i.e. the script will
+    be part of the message sent to the remote cluster. There is a chance that
+    the command is too long, when people has very long run or setup commands, or
+    there is a cloudflare proxy in front of the remote blocking the long
+    message. Several common causes are:
+    - SSH returning: `too long` in the error message.
+    - Cloudflare proxy returning: `414 Request-URI Too Large` or
+      `431 Request Header Fields Too Large` error.
+
+    We use a general length limit check before but it could be inaccurate on
+    some systems, e.g. cloudflare proxy, so this is necessary.
+
+    Args:
+        returncode: The return code of the setup command.
+        output: The output of the setup command.
+        file_path: The path to the setup log file.
+    """
+    assert (output is None) != (file_path is None), (
+        'Either output or file_path must be provided.', output, file_path)
+    to_check = []
+    for (match_str,
+         desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
+        if desired_rc == returncode:
+            to_check.append(match_str)
+    if not to_check:
+        return False
+
+    def _check_output_for_match_str(output: str) -> bool:
+        for match_str in to_check:
+            if match_str.lower() in output.lower():
+                return True
+        return False
+
+    if file_path is not None:
+        try:
+            with open(os.path.expanduser(file_path), 'r',
+                      encoding='utf-8') as f:
+                content = f.read()
+            return _check_output_for_match_str(content)
+        except Exception as e:  # pylint: disable=broad-except
+            # We don't crash the setup if we cannot read the log file.
+            # Instead, we should retry the setup with dumping the script
+            # to a file to be safe.
+            logger.debug(f'Failed to read setup log file {file_path}: {e}')
+            return True
+    else:
+        assert output is not None, (output, file_path)
+        return _check_output_for_match_str(output)
+
+
 def _get_cluster_config_template(cloud):
     cloud_to_template = {
         clouds.AWS: 'aws-ray.yml.j2',
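The _is_message_too_long helper added above pairs with _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT: it only scans for the error strings whose expected return code matches the observed one, so unrelated failures with the same code are not misclassified, and an unreadable log file conservatively counts as "too long" so the retry path is taken. A hypothetical call-site sketch follows; the runner object and its run/run_script_from_file methods are assumptions for illustration, not part of this diff.

    # Hypothetical call site for _is_message_too_long: on failure, decide
    # whether to retry with the script dumped to a file instead of inlined.
    def run_setup_with_fallback(runner, setup_script: str,
                                setup_log_path: str) -> int:
        returncode = runner.run(setup_script)  # assumed runner API
        if returncode != 0 and _is_message_too_long(
                returncode, file_path=setup_log_path):
            # The payload was too large for SSH or an intermediate proxy
            # (e.g. Cloudflare 414/431/400): send a short command that
            # executes the script from a file on the remote instead.
            returncode = runner.run_script_from_file(setup_script)  # assumed
        return returncode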
@@ -210,15 +296,18 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
         clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.SSH: 'kubernetes-ray.yml.j2',
+        clouds.Shadeform: 'shadeform-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
         clouds.Vast: 'vast-ray.yml.j2',
         clouds.Fluidstack: 'fluidstack-ray.yml.j2',
         clouds.Nebius: 'nebius-ray.yml.j2',
-        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+        clouds.Seeweb: 'seeweb-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]
 
@@ -248,511 +337,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
     return f.name
 
 
-class RayCodeGen:
-    """Code generator of a Ray program that executes a sky.Task.
-
-    Usage:
-
-      >> codegen = RayCodegen()
-      >> codegen.add_prologue()
-
-      >> codegen.add_ray_task(...)
-      >> codegen.add_ray_task(...)
-
-      >> codegen.add_epilogue()
-      >> code = codegen.build()
-    """
-
-    def __init__(self):
-        # Code generated so far, to be joined via '\n'.
-        self._code = []
-        # Guard method calling order.
-        self._has_prologue = False
-        self._has_epilogue = False
-
-        # For n nodes gang scheduling.
-        self._has_gang_scheduling = False
-        self._num_nodes = 0
-
-        self._has_register_run_fn = False
-
-        # job_id
-        # Job ID is used to identify the job (also this generated code).
-        # It is a int automatically generated by the DB on the cluster
-        # and monotonically increasing starting from 1.
-        # To generate the job ID, we use the following logic:
-        #   code = job_lib.JobLibCodeGen.add_job(username,
-        #                                        run_timestamp)
-        #   job_id = get_output(run_on_cluster(code))
-        self.job_id = None
-
-    def add_prologue(self, job_id: int) -> None:
-        assert not self._has_prologue, 'add_prologue() called twice?'
-        self._has_prologue = True
-        self.job_id = job_id
-        # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
-        # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
-        # Otherwise, ray will fail to get the placement group because of a bug
-        # in ray job.
-        ray_address = 'auto'
-        self._code = [
-            textwrap.dedent(f"""\
-                import functools
-                import getpass
-                import hashlib
-                import io
-                import os
-                import pathlib
-                import selectors
-                import shlex
-                import subprocess
-                import sys
-                import tempfile
-                import textwrap
-                import time
-                from typing import Dict, List, Optional, Tuple, Union
-
-                # Set the environment variables to avoid deduplicating logs and
-                # scheduler events. This should be set in driver code, since we are
-                # not using `ray job submit` anymore, and the environment variables
-                # from the ray cluster is not inherited.
-                os.environ['RAY_DEDUP_LOGS'] = '0'
-                os.environ['RAY_SCHEDULER_EVENTS'] = '0'
-
-                import ray
-                import ray.util as ray_util
-
-                from sky.skylet import autostop_lib
-                from sky.skylet import constants
-                from sky.skylet import job_lib
-                from sky.utils import log_utils
-                from sky.utils import subprocess_utils
-
-                SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
-
-                kwargs = dict()
-                # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
-                # the directory exists for backward compatibility for the VM
-                # launched before #1790.
-                if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
-                    kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
-                ray.init(
-                    address={ray_address!r},
-                    namespace='__sky__{job_id}__',
-                    log_to_driver=True,
-                    **kwargs
-                )
-                def get_or_fail(futures, pg) -> List[int]:
-                    \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
-                    if not futures:
-                        return []
-                    returncodes = [1] * len(futures)
-                    # Wait for 1 task to be ready.
-                    ready = []
-                    # Keep invoking ray.wait if ready is empty. This is because
-                    # ray.wait with timeout=None will only wait for 10**6 seconds,
-                    # which will cause tasks running for more than 12 days to return
-                    # before becoming ready.
-                    # (Such tasks are common in serving jobs.)
-                    # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
-                    while not ready:
-                        ready, unready = ray.wait(futures)
-                    idx = futures.index(ready[0])
-                    returncodes[idx] = ray.get(ready[0])
-                    while unready:
-                        if returncodes[idx] != 0:
-                            for task in unready:
-                                # ray.cancel without force fails to kill tasks.
-                                # We use force=True to kill unready tasks.
-                                ray.cancel(task, force=True)
-                                # Use SIGKILL=128+9 to indicate the task is forcely
-                                # killed.
-                                idx = futures.index(task)
-                                returncodes[idx] = 137
-                            break
-                        ready, unready = ray.wait(unready)
-                        idx = futures.index(ready[0])
-                        returncodes[idx] = ray.get(ready[0])
-                    # Remove the placement group after all tasks are done, so that
-                    # the next job can be scheduled on the released resources
-                    # immediately.
-                    ray_util.remove_placement_group(pg)
-                    sys.stdout.flush()
-                    return returncodes
-
-                run_fn = None
-                futures = []
-                """),
-            # FIXME: This is a hack to make sure that the functions can be found
-            # by ray.remote. This should be removed once we have a better way to
-            # specify dependencies for ray.
-            inspect.getsource(log_lib._ProcessingArgs),  # pylint: disable=protected-access
-            inspect.getsource(log_lib._get_context),  # pylint: disable=protected-access
-            inspect.getsource(log_lib._handle_io_stream),  # pylint: disable=protected-access
-            inspect.getsource(log_lib.process_subprocess_stream),
-            inspect.getsource(log_lib.run_with_log),
-            inspect.getsource(log_lib.make_task_bash_script),
-            inspect.getsource(log_lib.add_ray_env_vars),
-            inspect.getsource(log_lib.run_bash_command_with_log),
-            'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
-        ]
-        # Currently, the codegen program is/can only be submitted to the head
-        # node, due to using job_lib for updating job statuses, and using
-        # autostop_lib here.
-        self._code.append(
-            # Use hasattr to handle backward compatibility.
-            # TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
-            textwrap.dedent("""\
-                if hasattr(autostop_lib, 'set_last_active_time_to_now'):
-                    autostop_lib.set_last_active_time_to_now()
-                """))
-        self._code += [
-            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
-        ]
-
-    def add_gang_scheduling_placement_group_and_setup(
-        self,
-        num_nodes: int,
-        resources_dict: Dict[str, float],
-        stable_cluster_internal_ips: List[str],
-        env_vars: Dict[str, str],
-        setup_cmd: Optional[str] = None,
-        setup_log_path: Optional[str] = None,
-    ) -> None:
-        """Create the gang scheduling placement group for a Task.
-
-        cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
-        variable is assigned in a deterministic order whenever a new task is
-        added.
-        """
-        assert self._has_prologue, (
-            'Call add_prologue() before '
-            'add_gang_scheduling_placement_group_and_setup().')
-        self._has_gang_scheduling = True
-        self._num_nodes = num_nodes
-
-        bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
-        # Set CPU to avoid ray hanging the resources allocation
-        # for remote functions, since the task will request 1 CPU
-        # by default.
-        task_cpu_demand = resources_dict.pop('CPU')
-
-        if resources_dict:
-            assert len(resources_dict) == 1, (
-                'There can only be one type of accelerator per instance. '
-                f'Found: {resources_dict}.')
-            acc_name, acc_count = list(resources_dict.items())[0]
-            gpu_dict = {'GPU': acc_count}
-            # gpu_dict should be empty when the accelerator is not GPU.
-            # TODO(zongheng,zhanghao): an alternative is to start the remote
-            # cluster with custom resource 'GPU': <n> even if the accelerator(s)
-            # are not GPU. We opt for the current solution for now.
-            if accelerator_registry.is_schedulable_non_gpu_accelerator(
-                    acc_name):
-                gpu_dict = {}
-            for bundle in bundles:
-                bundle.update({
-                    # Set the GPU to avoid ray hanging the resources allocation
-                    **gpu_dict,
-                })
-
-        streaming_message = (
-            f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
-            f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
-            f'be killed){colorama.Style.RESET_ALL}')
-        self._code += [
-            textwrap.dedent(f"""\
-                pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
-                plural = 's' if {num_nodes} > 1 else ''
-                node_str = f'{num_nodes} node{{plural}}'
-                message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
-                           'Waiting for task resources on '
-                           f'{{node_str}}.{colorama.Style.RESET_ALL}')
-                print(message, flush=True)
-                # FIXME: This will print the error message from autoscaler if
-                # it is waiting for other task to finish. We should hide the
-                # error message.
-                ray.get(pg.ready())
-                print({streaming_message!r}, flush=True)
-                """)
-        ]
-
-        job_id = self.job_id
-        if setup_cmd is not None:
-            setup_envs = env_vars.copy()
-            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
-            self._code += [
-                textwrap.dedent(f"""\
-                setup_cmd = {setup_cmd!r}
-                _SETUP_CPUS = 0.0001
-                # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
-                # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
-                # We unset it so that user setup command may properly use this env var.
-                setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
-                job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
-
-                # The schedule_step should be called after the job status is set to non-PENDING,
-                # otherwise, the scheduler will think the current job is not submitted yet, and
-                # skip the scheduling step.
-                job_lib.scheduler.schedule_step()
-
-                total_num_nodes = len(ray.nodes())
-                setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
-                setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
-                setup_workers = [run_bash_command_with_log \\
-                    .options(
-                        name='setup',
-                        num_cpus=_SETUP_CPUS,
-                        scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
-                            placement_group=setup_pg,
-                            placement_group_bundle_index=i)
-                    ) \\
-                    .remote(
-                        setup_cmd,
-                        os.path.expanduser({setup_log_path!r}),
-                        env_vars={setup_envs!r},
-                        stream_logs=True,
-                        with_ray=True,
-                    ) for i in range(total_num_nodes)]
-                setup_returncodes = get_or_fail(setup_workers, setup_pg)
-                if sum(setup_returncodes) != 0:
-                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
-                    # This waits for all streaming logs to finish.
-                    time.sleep(1)
-                    print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
-                          'return code list:{colorama.Style.RESET_ALL}',
-                          setup_returncodes,
-                          flush=True)
-                    # Need this to set the job status in ray job to be FAILED.
-                    sys.exit(1)
-                """)
-            ]
-
-        self._code.append(f'job_lib.set_job_started({self.job_id!r})')
-        if setup_cmd is None:
-            # Need to call schedule_step() to make sure the scheduler
-            # schedule the next pending job.
-            self._code.append('job_lib.scheduler.schedule_step()')
-
-        # Export IP and node rank to the environment variables.
-        self._code += [
-            textwrap.dedent(f"""\
-                @ray.remote
-                def check_ip():
-                    return ray.util.get_node_ip_address()
-                gang_scheduling_id_to_ip = ray.get([
-                    check_ip.options(
-                            num_cpus={task_cpu_demand},
-                            scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
-                                placement_group=pg,
-                                placement_group_bundle_index=i
-                            )).remote()
-                    for i in range(pg.bundle_count)
-                ])
-
-                cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
-                job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
-                job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
-                job_ip_list_str = '\\n'.join(job_ip_rank_list)
-                """),
-        ]
-
-    def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
-        """Register the run function to be run on the remote cluster.
-
-        Args:
-            run_fn: The run function to be run on the remote cluster.
-        """
-        assert self._has_gang_scheduling, (
-            'Call add_gang_scheduling_placement_group_and_setup() '
-            'before register_run_fn().')
-        assert not self._has_register_run_fn, (
-            'register_run_fn() called twice?')
-        self._has_register_run_fn = True
-
-        self._code += [
-            run_fn,
-            f'run_fn = {run_fn_name}',
-        ]
-
-    def add_ray_task(self,
-                     bash_script: Optional[str],
-                     task_name: Optional[str],
-                     ray_resources_dict: Dict[str, float],
-                     log_dir: str,
-                     env_vars: Optional[Dict[str, str]] = None,
-                     gang_scheduling_id: int = 0) -> None:
-        """Generates code for a ray remote task that runs a bash command."""
-        assert self._has_gang_scheduling, (
-            'Call add_gang_scheduling_placement_group_and_setup() before '
-            'add_ray_task().')
-        assert (not self._has_register_run_fn or
-                bash_script is None), ('bash_script should '
-                                       'be None when run_fn is registered.')
-        task_cpu_demand = ray_resources_dict.pop('CPU')
-        # Build remote_task.options(...)
-        #   resources=...
-        #   num_gpus=...
-        options = []
-        options.append(f'num_cpus={task_cpu_demand}')
-
-        num_gpus = 0.0
-        if ray_resources_dict:
-            assert len(ray_resources_dict) == 1, (
-                'There can only be one type of accelerator per instance. '
-                f'Found: {ray_resources_dict}.')
-            num_gpus = list(ray_resources_dict.values())[0]
-            options.append(f'resources={json.dumps(ray_resources_dict)}')
-
-            resources_key = list(ray_resources_dict.keys())[0]
-            if not accelerator_registry.is_schedulable_non_gpu_accelerator(
-                    resources_key):
-                # `num_gpus` should be empty when the accelerator is not GPU.
-                # FIXME: use a set of GPU types, instead of 'tpu' in the key.
-
-                # Passing this ensures that the Ray remote task gets
-                # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
-                # would be force-set to empty by Ray.
-                options.append(f'num_gpus={num_gpus}')
-        options.append(
-            'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy('  # pylint: disable=line-too-long
-            'placement_group=pg, '
-            f'placement_group_bundle_index={gang_scheduling_id})')
-
-        sky_env_vars_dict_str = [
-            textwrap.dedent(f"""\
-                sky_env_vars_dict = {{}}
-                sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
-                sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
-                """)
-        ]
-
-        if env_vars is not None:
-            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
-                                         for k, v in env_vars.items())
-        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
-
-        options_str = ', '.join(options)
-        logger.debug('Added Task with options: '
-                     f'{options_str}')
-        # Script to block completion of a job until all storage mounted with
-        # CACHED_MOUNT mode is uploaded to remote.
-        rclone_flush_script = textwrap.dedent(f"""\
-
-            # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
-            # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
-            # rclone for normal mounts as well.
-            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
-                [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
-                [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
-                flushed=0
-                # extra second on top of --vfs-cache-poll-interval to
-                # avoid race condition between rclone log line creation and this check.
-                sleep 1
-                while [ $flushed -eq 0 ]; do
-                    # sleep for the same interval as --vfs-cache-poll-interval
-                    sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
-                    flushed=1
-                    for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
-                        exitcode=0
-                        tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
-                        if [ $exitcode -ne 0 ]; then
-                            echo "skypilot: cached mount is still uploading to remote"
-                            flushed=0
-                            break
-                        fi
-                    done
-                done
-                echo "skypilot: cached mount uploaded complete"
-            fi""")
-        self._code += [
-            sky_env_vars_dict_str,
-            textwrap.dedent(f"""\
-                script = {bash_script!r}
-                rclone_flush_script = {rclone_flush_script!r}
-                if run_fn is not None:
-                    script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
-
-                if script is not None:
-                    script += rclone_flush_script
-                    sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
-
-                    ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
-                    rank = job_ip_rank_map[ip]
-
-                    if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
-                        name_str = '{task_name},' if {task_name!r} != None else 'task,'
-                        log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
-                    else: # Single-node or multi-node task on multi-node cluster
-                        idx_in_cluster = cluster_ips_to_node_id[ip]
-                        if cluster_ips_to_node_id[ip] == 0:
-                            node_name = 'head'
-                        else:
-                            node_name = f'worker{{idx_in_cluster}}'
-                        name_str = f'{{node_name}}, rank={{rank}},'
-                        log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
-                    sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
-
-                    sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-
-                    futures.append(run_bash_command_with_log \\
-                            .options(name=name_str, {options_str}) \\
-                            .remote(
-                                script,
-                                log_path,
-                                env_vars=sky_env_vars_dict,
-                                stream_logs=True,
-                                with_ray=True,
-                            ))""")
-        ]
-
-    def add_epilogue(self) -> None:
-        """Generates code that waits for all tasks, then exits."""
-        assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
-        assert not self._has_epilogue, 'add_epilogue() called twice?'
-        self._has_epilogue = True
-
-        self._code += [
-            textwrap.dedent(f"""\
-                returncodes = get_or_fail(futures, pg)
-                if sum(returncodes) != 0:
-                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
-                    # Schedule the next pending job immediately to make the job
-                    # scheduling more efficient.
-                    job_lib.scheduler.schedule_step()
-                    # This waits for all streaming logs to finish.
-                    time.sleep(0.5)
-                    reason = ''
-                    # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
-                    if any(r == 139 for r in returncodes):
-                        reason = '(likely due to Segmentation Fault)'
-                    if any(r == 137 for r in returncodes):
-                        # Find the first non-137 return code
-                        non_137 = next(r for r in returncodes if r != 137)
-                        reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
-                    print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
-                          'return code list:{colorama.Style.RESET_ALL}',
-                          returncodes,
-                          reason,
-                          flush=True)
-                    # Need this to set the job status in ray job to be FAILED.
-                    sys.exit(1)
-                else:
-                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
-                    # Schedule the next pending job immediately to make the job
-                    # scheduling more efficient.
-                    job_lib.scheduler.schedule_step()
-                    # This waits for all streaming logs to finish.
-                    time.sleep(0.5)
-                """)
-        ]
-
-    def build(self) -> str:
-        """Returns the entire generated program."""
-        assert self._has_epilogue, 'Call add_epilogue() before build().'
-        return '\n'.join(self._code)
-
-
 class GangSchedulingStatus(enum.Enum):
     """Enum for gang scheduling status."""
     CLUSTER_READY = 0
@@ -1340,6 +924,34 @@ class RetryingVmProvisioner(object):
             zones = [clouds.Zone(name=to_provision.zone)]
             yield zones
 
+    def _insufficient_resources_msg(
+        self,
+        to_provision: resources_lib.Resources,
+        requested_resources: Set[resources_lib.Resources],
+        insufficient_resources: Optional[List[str]],
+    ) -> str:
+        insufficent_resource_msg = ('' if insufficient_resources is None else
+                                    f' ({", ".join(insufficient_resources)})')
+        message = f'Failed to acquire resources{insufficent_resource_msg} '
+        if to_provision.zone is not None:
+            message += (f'in {to_provision.zone} for {requested_resources}. ')
+        elif to_provision.region is not None and to_provision.cloud is not None:
+            # For public clouds, provision.region is always set.
+            if clouds.SSH().is_same_cloud(to_provision.cloud):
+                message += (
+                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'for {requested_resources}. The SSH Node Pool may not '
+                    'have enough resources.')
+            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
+                message += (f'in context {to_provision.region} for '
+                            f'{requested_resources}. ')
+            else:
+                message += (f'in all zones in {to_provision.region} for '
+                            f'{requested_resources}. ')
+        else:
+            message += (f'{to_provision.cloud} for {requested_resources}. ')
+        return message
+
     def _retry_zones(
         self,
         to_provision: resources_lib.Resources,
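The _insufficient_resources_msg helper added above centralizes the failure-message construction that was previously inlined at the raise site (the old copy is deleted later in this diff, in the @@ -1760,26 +1391,9 @@ hunk). A rough standalone sketch of the same dispatch follows, with plain strings standing in for the real Resources and cloud objects:

    # Standalone sketch of the message dispatch; the 'ssh'/'kubernetes'
    # strings stand in for the clouds.SSH()/clouds.Kubernetes() checks.
    def insufficient_resources_msg(zone, region, cloud, requested,
                                   details=None):
        suffix = '' if details is None else f' ({", ".join(details)})'
        message = f'Failed to acquire resources{suffix} '
        if zone is not None:
            message += f'in {zone} for {requested}. '
        elif region is not None and cloud is not None:
            if cloud == 'ssh':
                message += (f'in SSH Node Pool ({region.lstrip("ssh-")}) for '
                            f'{requested}. The SSH Node Pool may not have '
                            'enough resources.')
            elif cloud == 'kubernetes':
                message += f'in context {region} for {requested}. '
            else:
                message += f'in all zones in {region} for {requested}. '
        else:
            message += f'{cloud} for {requested}. '
        return message

    print(insufficient_resources_msg(None, 'us-east-1', 'aws', '1x A100'))
    # Failed to acquire resources in all zones in us-east-1 for 1x A100.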
@@ -1418,6 +1030,7 @@ class RetryingVmProvisioner(object):
             f'To request quotas, check the instruction: '
             f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
 
+        insufficient_resources = None
         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
                                        prev_cluster_ever_up):
@@ -1630,6 +1243,24 @@ class RetryingVmProvisioner(object):
                     # No teardown happens for this error.
                     with ux_utils.print_exception_no_traceback():
                         raise
+                except config_lib.KubernetesError as e:
+                    if e.insufficent_resources:
+                        insufficient_resources = e.insufficent_resources
+                    # NOTE: We try to cleanup the cluster even if the previous
+                    # cluster does not exist. Also we are fast at
+                    # cleaning up clusters now if there is no existing node.
+                    CloudVmRayBackend().post_teardown_cleanup(
+                        handle,
+                        terminate=not prev_cluster_ever_up,
+                        remove_from_db=False,
+                        failover=True,
+                    )
+                    # TODO(suquark): other clouds may have different zone
+                    # blocking strategy. See '_update_blocklist_on_error'
+                    # for details.
+                    FailoverCloudErrorHandlerV2.update_blocklist_on_error(
+                        self._blocked_resources, to_provision, region, zones, e)
+                    continue
                 except Exception as e:  # pylint: disable=broad-except
                     # NOTE: We try to cleanup the cluster even if the previous
                     # cluster does not exist. Also we are fast at
@@ -1760,26 +1391,9 @@ class RetryingVmProvisioner(object):
                     terminate=terminate_or_stop,
                     remove_from_db=False)
 
-
-
-
-                       f'{requested_resources}. ')
-        elif to_provision.region is not None:
-            # For public clouds, provision.region is always set.
-            if clouds.SSH().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in SSH Node Pool '
-                           f'({to_provision.region.lstrip("ssh-")}) for '
-                           f'{requested_resources}. The SSH Node Pool may not '
-                           'have enough resources.')
-            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
-                message = ('Failed to acquire resources in context '
-                           f'{to_provision.region} for {requested_resources}. ')
-            else:
-                message = ('Failed to acquire resources in all zones in '
-                           f'{to_provision.region} for {requested_resources}. ')
-        else:
-            message = (f'Failed to acquire resources in {to_provision.cloud} '
-                       f'for {requested_resources}. ')
+        message = self._insufficient_resources_msg(to_provision,
+                                                   requested_resources,
+                                                   insufficient_resources)
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -2175,8 +1789,6 @@ class RetryingVmProvisioner(object):
             # terminated by _retry_zones().
             assert (prev_cluster_status == status_lib.ClusterStatus.INIT
                    ), prev_cluster_status
-            assert global_user_state.get_handle_from_cluster_name(
-                cluster_name) is None, cluster_name
             logger.info(
                 ux_utils.retry_message(
                     f'Retrying provisioning with requested resources: '
@@ -2215,9 +1827,8 @@ class RetryingVmProvisioner(object):
         for (resource, exception) in resource_exceptions.items():
             table.add_row([
                 resource.infra.formatted_str(),
-                resources_utils.format_resource(
-
-                exception
+                resources_utils.format_resource(
+                    resource, simplified_only=True)[0], exception
             ])
         # Set the max width of REASON column to 80 to avoid the table
         # being wrapped in a unreadable way.
@@ -2239,6 +1850,18 @@ class SSHTunnelInfo:
     pid: int
 
 
+def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.settimeout(0.5)
+            s.connect(('localhost', tunnel.port))
+            return True
+    except socket.error as e:
+        logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
+                       f'{common_utils.format_exception(e)}')
+        return False
+
+
 class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """A pickle-able handle to a cluster created by CloudVmRayBackend.
 
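`_is_tunnel_healthy` above is a plain TCP connect probe with a short timeout. A standalone restatement of the same check (note that a successful connect only proves something is listening on the port, not that the Skylet behind it is functional):

```python
import socket


def port_is_open(host: str, port: int, timeout: float = 0.5) -> bool:
    """Probe a TCP port the way the health check above does."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            s.connect((host, port))
            return True
    except OSError:  # socket.error is an alias of OSError
        return False


print(port_is_open('localhost', 22))
```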
@@ -2261,8 +1884,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     - (optional) Skylet SSH tunnel info.
     """
     # Bump if any fields get added/removed/changed, and add backward
-    #
-    _VERSION =
+    # compatibility logic in __setstate__ and/or __getstate__.
+    _VERSION = 12
 
     def __init__(
         self,
@@ -2296,7 +1919,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
         self.is_grpc_enabled = True
-        self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
 
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2313,12 +1935,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
                 f'\n\tssh_user={self.ssh_user},'
-                f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
-                f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+                f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
 
     def get_cluster_name(self):
         return self.cluster_name
 
+    def get_cluster_name_on_cloud(self):
+        return self.cluster_name_on_cloud
+
     def _use_internal_ips(self):
         """Returns whether to use internal IPs for SSH connections."""
         # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2345,7 +1969,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     def _update_cluster_info(self):
         # When a cluster is on a cloud that does not support the new
         # provisioner, we should skip updating cluster_info.
-        if (self.launched_resources.cloud
+        if (self.launched_resources.cloud is not None and
+                self.launched_resources.cloud.PROVISIONER_VERSION >=
                 clouds.ProvisionerVersion.SKYPILOT):
             provider_name = str(self.launched_resources.cloud).lower()
             config = {}
@@ -2643,64 +2268,199 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                                                      cluster_config_file)
         self.docker_user = docker_user
 
+    def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
+        metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name)
+        if metadata is None:
+            return None
+        return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
+
+    def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
+        global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
+            self.cluster_name,
+            (tunnel.port, tunnel.pid) if tunnel is not None else None)
+
+    def close_skylet_ssh_tunnel(self) -> None:
+        """Terminate the SSH tunnel process and clear its metadata."""
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is None:
+            return
+        logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
+                     self.cluster_name, tunnel.port)
+        try:
+            self._terminate_ssh_tunnel_process(tunnel)
+        finally:
+            self._set_skylet_ssh_tunnel(None)
+
     def get_grpc_channel(self) -> 'grpc.Channel':
-
-
-
-
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
+        # It's fine to not grab the lock here, as we're only reading,
+        # and writes are very rare.
+        # It's acceptable to read while another process is opening a tunnel,
+        # because it will only happen on:
+        # 1. A new cluster who has no tunnel yet, or
+        # 2. A cluster with an unhealthy tunnel
+        # For (2), for processes that read the "stale" tunnel, it will fail
+        # and on the next retry, it will call get_grpc_channel again
+        # and get the new tunnel.
+        tunnel = self._get_skylet_ssh_tunnel()
+        if tunnel is not None:
+            if _is_tunnel_healthy(tunnel):
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
+            logger.debug('Failed to connect to SSH tunnel for cluster '
+                         f'{self.cluster_name!r} on port {tunnel.port}')
+
+        lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
+        remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
+        start_time = time.perf_counter()
+        attempt = 1
+
+        def _get_remaining_timeout() -> float:
+            return max(0.0,
+                       remaining_timeout - (time.perf_counter() - start_time))
+
+        while remaining_timeout > 0:
+            logger.debug(
+                'Attempting to acquire exclusive lock for %s (attempt %d)',
+                lock_id, attempt)
+            exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
+            try:
+                with exclusive_lock.acquire(blocking=False):
+                    wait_elapsed = time.perf_counter() - start_time
+                    logger.debug(f'Acquired exclusive lock for {lock_id} after '
+                                 f'{wait_elapsed:.2f}s')
+                    try:
+                        tunnel = self._open_and_update_skylet_tunnel()
+                        return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                     options=grpc_options)
+                    except Exception as e:  # pylint: disable=broad-except
+                        # Failed to open tunnel, release the lock and retry.
+                        logger.warning(f'Failed to open tunnel for cluster '
+                                       f'{self.cluster_name!r}: '
+                                       f'{common_utils.format_exception(e)}')
+                        remaining_timeout = _get_remaining_timeout()
+                        attempt += 1
+                        continue
+            except locks.LockTimeout:
+                pass
 
-
-
+            remaining_timeout = _get_remaining_timeout()
+            logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
+                         f'waiting on shared lock (attempt {attempt})')
+            try:
+                # Use shared lock so that concurrent readers can
+                # proceed in parallel.
+                shared_lock = locks.get_lock(lock_id,
+                                             remaining_timeout,
+                                             shared_lock=True)
+                # Wait for the exclusive lock to be released.
+                shared_lock.acquire(blocking=True)
+                # We only need the lock for signalling that the new tunnel has
+                # been opened, not for checking the tunnel health.
+                # Same reasoning as why we don't need to grab the lock in
+                # the fast path at the start of this function.
+                shared_lock.release()
+                wait_elapsed = time.perf_counter() - start_time
+                logger.debug(f'Acquired shared lock for {lock_id} after '
+                             f'{wait_elapsed:.2f}s')
+            except locks.LockTimeout as e:
+                raise RuntimeError(
+                    f'Failed to get gRPC channel for cluster '
+                    f'{self.cluster_name!r} due to a timeout when waiting '
+                    'for the SSH tunnel to be opened. Please try again or '
+                    f'manually remove the lock at {lock_id}. '
+                    f'{common_utils.format_exception(e)}') from e
+
+            # Add small jitter before probing to smoothen the effects
+            # of many readers waking up simultaneously.
+            jitter = random.uniform(0.01, 0.05)
+            time.sleep(jitter)
+
+            # Re-read the tunnel metadata and verify it's healthy.
+            tunnel = self._get_skylet_ssh_tunnel()
+            if tunnel is not None:
+                if _is_tunnel_healthy(tunnel):
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
+                logger.debug('Failed to connect to SSH tunnel for cluster '
+                             f'{self.cluster_name!r} on port {tunnel.port}')
+            # Tunnel is still unhealthy or missing, try again with updated
+            # timeout. This could happen in the case where the thread who
+            # held the exclusive lock to open the tunnel crashed.
+            remaining_timeout = _get_remaining_timeout()
+            attempt += 1
+        raise RuntimeError('Timeout waiting for gRPC channel for cluster '
+                           f'{self.cluster_name!r} to be ready.')
+
+    def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
+        """Terminate the SSH tunnel process."""
         try:
             proc = psutil.Process(tunnel_info.pid)
             if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
                 logger.debug(
                     f'Terminating SSH tunnel process {tunnel_info.pid}')
-                proc.
-                try:
-                    proc.wait(timeout=3)
-                except psutil.TimeoutExpired:
-                    proc.kill()
-                    proc.wait(timeout=1)
+                subprocess_utils.kill_children_processes(proc.pid)
         except psutil.NoSuchProcess:
             pass
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(
                 f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
 
-    def
+    def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
         """Opens an SSH tunnel to the Skylet on the head node,
         updates the cluster handle, and persists it to the database."""
-
-
-
-
-
-
-
-
-
-
-
-
-
+        max_attempts = 3
+        # There could be a race condition here, as multiple processes may
+        # attempt to open the same port at the same time.
+        for attempt in range(max_attempts):
+            runners = self.get_command_runners()
+            head_runner = runners[0]
+            local_port = random.randint(10000, 65535)
+            try:
+                ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
+                    head_runner, (local_port, constants.SKYLET_GRPC_PORT))
+            except exceptions.CommandError as e:
+                # Don't retry if the error is due to timeout,
+                # connection refused, Kubernetes pods not found,
+                # or an in-progress termination.
+                if (e.detailed_reason is not None and
+                    (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
+                        e.detailed_reason) or
+                     backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
+                         e.detailed_reason) or attempt == max_attempts - 1)):
+                    raise e
+                logger.warning(
+                    f'Failed to open SSH tunnel on port {local_port} '
+                    f'({attempt + 1}/{max_attempts}). '
+                    f'{e.error_msg}\n{e.detailed_reason}')
+                continue
+            tunnel_info = SSHTunnelInfo(port=local_port,
+                                        pid=ssh_tunnel_proc.pid)
+            break
+
         try:
             grpc.channel_ready_future(
                 grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
                     timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
             # Clean up existing tunnel before setting up the new one.
-
-
-
-
+            old_tunnel = self._get_skylet_ssh_tunnel()
+            if old_tunnel is not None:
+                self._terminate_ssh_tunnel_process(old_tunnel)
+            self._set_skylet_ssh_tunnel(tunnel_info)
+            return tunnel_info
         except grpc.FutureTimeoutError as e:
-            self.
+            self._terminate_ssh_tunnel_process(tunnel_info)
            logger.warning(
                f'Skylet gRPC channel for cluster {self.cluster_name} not '
                f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
            raise e
        except Exception as e:
-            self.
+            self._terminate_ssh_tunnel_process(tunnel_info)
             raise e
 
     @property
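`get_grpc_channel` above implements a double-checked pattern: a lock-free fast path that reuses a healthy tunnel, an exclusive lock for the single writer that opens a new one, and a shared lock that other readers use purely to wait for the writer to finish. A schematic of that control flow using in-process threading primitives instead of SkyPilot's cross-process `locks` module; a deliberate simplification, since the real code also retries with a deadline and jitter:

```python
import threading

_writer_lock = threading.Lock()
_tunnel = None  # shared state published by the writer


def get_channel(open_tunnel, is_healthy):
    global _tunnel
    # Fast path: reading without the lock is acceptable; a stale read
    # just fails the health check and falls through.
    if _tunnel is not None and is_healthy(_tunnel):
        return _tunnel
    if _writer_lock.acquire(blocking=False):
        try:
            # We are the single writer: open and publish a fresh tunnel.
            _tunnel = open_tunnel()
            return _tunnel
        finally:
            _writer_lock.release()
    # Someone else is opening the tunnel: wait for them to finish
    # (acquire-then-release is used purely as a signal), then re-check.
    with _writer_lock:
        pass
    if _tunnel is not None and is_healthy(_tunnel):
        return _tunnel
    raise RuntimeError('Tunnel still unhealthy after the writer finished.')


print(get_channel(lambda: ('localhost', 50051), lambda t: True))
```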
@@ -2713,6 +2473,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     def cluster_yaml(self, value: Optional[str]):
         self._cluster_yaml = value
 
+    @property
+    def instance_ids(self):
+        if self.cached_cluster_info is not None:
+            return self.cached_cluster_info.instance_ids()
+        return None
+
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
@@ -2752,6 +2518,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
         return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # For backwards compatibility. Refer to
+        # https://github.com/skypilot-org/skypilot/pull/7133
+        state.setdefault('skylet_ssh_tunnel', None)
+        return state
+
     def __setstate__(self, state):
         self._version = self._VERSION
 
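The `__getstate__`/`__setstate__` pair above version-gates the pickled handle: serialization keeps a `skylet_ssh_tunnel` default so older readers that still expect the field can unpickle it, while version-12 readers drop the field because it moved to a database column. A schematic of the idiom; the class and field names here are illustrative, not the real handle:

```python
import pickle


class Handle:
    _VERSION = 2

    def __init__(self):
        self._version = self._VERSION
        self.name = 'my-cluster'

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep a default so older readers that still expect the field
        # can unpickle us.
        state.setdefault('tunnel', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 0)
        if version >= 2:
            # The field now lives elsewhere (e.g. a DB column): drop it.
            state.pop('tunnel', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)


restored = pickle.loads(pickle.dumps(Handle()))
print(restored.name)
```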
@@ -2809,6 +2582,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             state['is_grpc_enabled'] = False
             state['skylet_ssh_tunnel'] = None
 
+        if version >= 12:
+            # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+            state.pop('skylet_ssh_tunnel', None)
+
         self.__dict__.update(state)
 
         # Because the update_cluster_ips and update_ssh_ports
@@ -2886,21 +2663,180 @@ class SkyletClient:
 
     def __init__(self, channel: 'grpc.Channel'):
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+        self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
+        self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
 
     def set_autostop(
         self,
         request: 'autostopv1_pb2.SetAutostopRequest',
-        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
     ) -> 'autostopv1_pb2.SetAutostopResponse':
         return self._autostop_stub.SetAutostop(request, timeout=timeout)
 
     def is_autostopping(
         self,
         request: 'autostopv1_pb2.IsAutostoppingRequest',
-        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
     ) -> 'autostopv1_pb2.IsAutostoppingResponse':
         return self._autostop_stub.IsAutostopping(request, timeout=timeout)
 
+    def add_job(
+        self,
+        request: 'jobsv1_pb2.AddJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.AddJobResponse':
+        return self._jobs_stub.AddJob(request, timeout=timeout)
+
+    def queue_job(
+        self,
+        request: 'jobsv1_pb2.QueueJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.QueueJobResponse':
+        return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+    def update_status(
+        self,
+        request: 'jobsv1_pb2.UpdateStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.UpdateStatusResponse':
+        return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+    def get_job_queue(
+        self,
+        request: 'jobsv1_pb2.GetJobQueueRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobQueueResponse':
+        return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+    def cancel_jobs(
+        self,
+        request: 'jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.CancelJobsResponse':
+        return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+    def fail_all_in_progress_jobs(
+        self,
+        request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+        return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+    def get_job_status(
+        self,
+        request: 'jobsv1_pb2.GetJobStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobStatusResponse':
+        return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+    def get_job_submitted_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+        return self._jobs_stub.GetJobSubmittedTimestamp(request,
+                                                        timeout=timeout)
+
+    def get_job_ended_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+        return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+    def get_log_dirs_for_jobs(
+        self,
+        request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+        return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
+    def get_service_status(
+        self,
+        request: 'servev1_pb2.GetServiceStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.GetServiceStatusResponse':
+        return self._serve_stub.GetServiceStatus(request, timeout=timeout)
+
+    def add_serve_version(
+        self,
+        request: 'servev1_pb2.AddVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.AddVersionResponse':
+        return self._serve_stub.AddVersion(request, timeout=timeout)
+
+    def terminate_services(
+        self,
+        request: 'servev1_pb2.TerminateServicesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.TerminateServicesResponse':
+        return self._serve_stub.TerminateServices(request, timeout=timeout)
+
+    def terminate_replica(
+        self,
+        request: 'servev1_pb2.TerminateReplicaRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.TerminateReplicaResponse':
+        return self._serve_stub.TerminateReplica(request, timeout=timeout)
+
+    def wait_service_registration(
+        self,
+        request: 'servev1_pb2.WaitServiceRegistrationRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
+        # set timeout to at least 10 seconds more than service register
+        # constant to make sure that timeouts will not occur.
+        if timeout is not None:
+            timeout = max(timeout,
+                          serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
+        return self._serve_stub.WaitServiceRegistration(request,
+                                                        timeout=timeout)
+
+    def update_service(
+        self,
+        request: 'servev1_pb2.UpdateServiceRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'servev1_pb2.UpdateServiceResponse':
+        return self._serve_stub.UpdateService(request, timeout=timeout)
+
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                          timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
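SkyletClient above is a thin facade: one generated stub per proto service, each RPC exposed as a plain method that forwards a request with a uniform, overridable timeout (`None` disables the deadline, which matters for streaming calls like `tail_logs`). A generic sketch of the pattern, with a fake stub standing in for the `*_pb2_grpc` generated code:

```python
from typing import Optional

DEFAULT_TIMEOUT = 30.0


def make_stub(channel):
    """Stands in for a *_pb2_grpc generated ServiceStub(channel)."""

    class _Stub:

        def DoThing(self, request, timeout=None):
            return {'request': request, 'timeout': timeout}

    return _Stub()


class Client:

    def __init__(self, channel):
        self._stub = make_stub(channel)

    def do_thing(self, request, timeout: Optional[float] = DEFAULT_TIMEOUT):
        # One forwarding method per RPC keeps call sites typed and simple.
        return self._stub.DoThing(request, timeout=timeout)


print(Client(channel=None).do_thing('ping'))
```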
@@ -2931,6 +2867,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._requested_features = set()
         self._dump_final_script = False
         self._is_managed = False
+        # Optional planner (via register_info): used under the per-cluster lock
+        # to produce a fresh concrete plan when neither a reusable snapshot nor
+        # a caller plan is available.
+        self._planner = None
 
         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
@@ -2948,6 +2888,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 self._requested_features)
         self._dump_final_script = kwargs.pop('dump_final_script', False)
         self._is_managed = kwargs.pop('is_managed', False)
+        # Optional planner callback for a fresh plan under lock when no
+        # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
+        self._planner = kwargs.pop('planner', self._planner)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'
 
     def check_resources_fit_cluster(
@@ -2974,9 +2917,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Usage Collection:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, launched_resources)
-
-        if
-            usage_lib.messages.usage.update_cluster_status(
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        if status is not None:
+            usage_lib.messages.usage.update_cluster_status(status)
 
         assert launched_resources.region is not None, handle
 
@@ -3115,7 +3058,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 colorama.Style.RESET_ALL +
                 colorama.Style.DIM +
                 'Check concurrent requests: ' +
-                'sky api status '
+                'sky api status -v | grep '
+                f'{cluster_name}'))
 
     def _locked_provision(
         self,
@@ -3172,8 +3116,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         try:
             retry_provisioner = RetryingVmProvisioner(
                 self.log_dir,
-                self._dag,
-                self._optimize_target,
+                self._dag,  # type: ignore[arg-type]
+                self._optimize_target,  # type: ignore[arg-type]
                 self._requested_features,
                 local_wheel_path,
                 wheel_hash,
@@ -3204,9 +3148,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
                 retry_message = ux_utils.retry_message(
                     f'Retry after {gap_seconds:.0f}s ')
-                hint_message = (
-
-
+                hint_message = (
+                    f'\n{retry_message} '
+                    f'{ux_utils.provision_hint(cluster_name)}'
+                    f'{colorama.Style.RESET_ALL}')
 
                 # Add cluster event for retry.
                 global_user_state.add_cluster_event(
@@ -3235,7 +3180,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             logger.error(
                 ux_utils.error_message(
                     'Failed to provision resources. '
-                    f'{ux_utils.
+                    f'{ux_utils.provision_hint(cluster_name)}'))
             error_message += (
                 '\nTo keep retrying until the cluster is up, use '
                 'the `--retry-until-up` flag.')
@@ -3244,8 +3189,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 error_message + '\n' + str(e),
                 failover_history=e.failover_history) from None
         if dryrun:
-
-
+            handle = global_user_state.get_handle_from_cluster_name(
+                cluster_name)
+            return handle if handle is not None else None, False
 
         if config_dict['provisioning_skipped']:
             # Skip further provisioning.
@@ -3253,10 +3199,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # ('handle', 'provision_record', 'resources_vars')
             # We need to return the handle - but it should be the existing
             # handle for the cluster.
-
-
-
-            return
+            handle = global_user_state.get_handle_from_cluster_name(
+                cluster_name)
+            assert handle is not None, (cluster_name, handle)
+            return handle, True
 
         if 'provision_record' in config_dict:
             # New provisioner is used here.
@@ -3279,7 +3225,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 global_user_state.ClusterEventType.STATUS_CHANGE)
 
             cluster_info = provisioner.post_provision_runtime_setup(
-
+                handle.launched_resources,
                 resources_utils.ClusterName(handle.cluster_name,
                                             handle.cluster_name_on_cloud),
                 handle.cluster_yaml,
@@ -3293,6 +3239,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # manually or by the cloud provider.
             # Optimize the case where the cluster's IPs can be retrieved
             # from cluster_info.
+            handle.cached_cluster_info = cluster_info
             handle.docker_user = cluster_info.docker_user
             handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
                                       cluster_info=cluster_info)
@@ -3304,7 +3251,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status,
+                prev_cluster_status, config_hash)
             return handle, False
 
         cluster_config_file = config_dict['ray']
@@ -3376,7 +3323,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         self._update_after_cluster_provisioned(
             handle, to_provision_config.prev_handle, task,
-            prev_cluster_status,
+            prev_cluster_status, config_hash)
         return handle, False
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3394,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-
+            config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3406,16 +3353,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # update_status will query the ray job status for all INIT /
             # PENDING / RUNNING jobs for the real status, since we do not
             # know the actual previous status of the cluster.
-            cmd = job_lib.JobLibCodeGen.update_status()
             logger.debug('Update job queue on remote cluster.')
             with rich_utils.safe_status(
                     ux_utils.spinner_message('Preparing SkyPilot runtime')):
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if not use_legacy:
+                    try:
+                        request = jobsv1_pb2.UpdateStatusRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel()
+                                                ).update_status(request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.update_status()
+                    returncode, _, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd, 'Failed to update job status.', stderr)
             if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
                 # Safely set all the previous jobs to FAILED since the cluster
                 # is restarted
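This hunk introduces the try-gRPC-first, fall-back-to-legacy pattern that recurs throughout the rest of the diff: attempt the Skylet RPC, and if the remote runtime predates it, downgrade to the older run-generated-code-over-SSH path. A minimal restatement, with both call targets as hypothetical stand-ins:

```python
class MethodNotImplemented(Exception):
    """Stands in for exceptions.SkyletMethodNotImplementedError."""


def grpc_call():
    # Simulate a remote skylet that predates this RPC.
    raise MethodNotImplemented


def legacy_call():
    return 'ran generated code over SSH'


def update_status(grpc_enabled: bool):
    use_legacy = not grpc_enabled
    if not use_legacy:
        try:
            return grpc_call()
        except MethodNotImplemented:
            use_legacy = True  # quietly downgrade for old remotes
    if use_legacy:
        return legacy_call()


print(update_status(grpc_enabled=True))
```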
@@ -3423,14 +3380,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # 1. A job finishes RUNNING, but right before it update itself
                 # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
                 # 2. On next `sky start`, it gets reset to FAILED.
-
-
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if not use_legacy:
+                    try:
+                        fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).fail_all_in_progress_jobs(fail_request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+                    returncode, stdout, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd,
+                        'Failed to set previously in-progress jobs to FAILED',
+                        stdout + stderr)
 
             prev_ports = None
             if prev_handle is not None:
@@ -3485,8 +3453,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle.cached_external_ssh_ports, handle.docker_user,
                 handle.ssh_user)
 
-        locks.get_lock(lock_id).force_unlock()
-
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
                       workdir: Union[Path, Dict[str, Any]],
                       envs_and_secrets: Dict[str, str]) -> None:
@@ -3618,6 +3584,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._set_storage_mounts_metadata(handle.cluster_name,
                                               storage_mounts)
 
+    def _get_num_gpus(self, task: task_lib.Task) -> int:
+        if task.resources is not None:
+            for resource in task.resources:
+                if (resource.accelerators is not None and
+                        isinstance(resource.accelerators, dict)):
+                    if len(resource.accelerators) > 0:
+                        return math.ceil(
+                            list(resource.accelerators.values())[0])
+        return 0
+
     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
         start = time.time()
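`_get_num_gpus` above reads the first accelerator count from the task's resource options and rounds fractional requests up. The core lookup, restated on a bare dict:

```python
import math
from typing import Dict, Optional


def num_gpus_per_node(accelerators: Optional[Dict[str, float]]) -> int:
    """First accelerator entry, rounded up; 0 when none are requested."""
    if accelerators:
        return math.ceil(next(iter(accelerators.values())))
    return 0


print(num_gpus_per_node({'A100': 0.5}))  # -> 1
print(num_gpus_per_node(None))  # -> 0
```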
@@ -3630,13 +3606,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
         # Need this `-i` option to make sure `source ~/.bashrc` work
         setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
+        setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
         runners = handle.get_command_runners(avoid_ssh_control=True)
 
         def _setup_node(node_id: int) -> None:
-            setup_envs =
+            setup_envs = task_lib.get_plaintext_envs_and_secrets(
+                task.envs_and_secrets)
             setup_envs.update(self._skypilot_predefined_env_vars(handle))
             setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
             setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
+            setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
+                self._get_num_gpus(task)))
+
             runner = runners[node_id]
             setup_script = log_lib.make_task_bash_script(setup,
                                                          env_vars=setup_envs)
@@ -3693,29 +3676,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
 
-
-
-
-
-                                      encoding='utf-8') as f:
-                        return match_str.lower() in f.read().lower()
-                except Exception as e:  # pylint: disable=broad-except
-                    # We don't crash the setup if we cannot read the log file.
-                    # Instead, we should retry the setup with dumping the script
-                    # to a file to be safe.
-                    logger.debug(
-                        f'Failed to read setup log file {setup_log_path}: {e}')
-                    return True
-
-            if ((returncode == 255 and _load_setup_log_and_match('too long')) or
-                    (returncode == 1 and
-                     _load_setup_log_and_match('request-uri too large'))):
-                # If the setup script is too long, we retry it with dumping
-                # the script to a file and running it with SSH. We use a
-                # general length limit check before but it could be
-                # inaccurate on some systems.
-                # When there is a cloudflare proxy in front of the remote, it
-                # could cause `414 Request-URI Too Large` error.
+            if _is_message_too_long(returncode, file_path=setup_log_path):
+                # If the setup script is too long, we need to retry it
+                # with dumping the script to a file and running it the script
+                # on remote cluster instead.
                 logger.debug('Failed to run setup command inline due to '
                              'command length limit. Dumping setup script to '
                              'file and running it with SSH.')
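The inline checks removed here are consolidated into `_is_message_too_long`, whose body is not part of this hunk. A plausible sketch inferred from the deleted code and the two call sites (returncode 255 with "too long" from SSH, returncode 1 with "request-uri too large" from a proxy); treat the signature and details as assumptions, not the shipped helper:

```python
from typing import Optional


def _is_message_too_long(returncode: int,
                         output: Optional[str] = None,
                         file_path: Optional[str] = None) -> bool:
    """Did the command fail because the inline script was too long?"""

    def _match(needle: str) -> bool:
        if output is not None:
            return needle in output.lower()
        if file_path is not None:
            try:
                with open(file_path, encoding='utf-8') as f:
                    return needle in f.read().lower()
            except OSError:
                # Cannot read the log; assume too-long so the caller
                # retries via the safe dump-to-file path.
                return True
        return False

    return ((returncode == 255 and _match('too long')) or
            (returncode == 1 and _match('request-uri too large')))
```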
@@ -3779,119 +3743,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.finishing_message('Setup completed.', setup_log_path))
 
+    def _download_file(self, handle: CloudVmRayResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        runners = handle.get_command_runners()
+        head_runner = runners[0]
+        head_runner.rsync(
+            source=local_file_path,
+            target=remote_file_path,
+            up=False,
+            stream_logs=False,
+        )
+
     def _exec_code_on_head(
         self,
         handle: CloudVmRayResourceHandle,
         codegen: str,
         job_id: int,
-        detach_run: bool = False,
         managed_job_dag: Optional['dag.Dag'] = None,
+        managed_job_user_id: Optional[str] = None,
         remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+        file_name = f'sky_job_{job_id}'
+        script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
         if remote_log_dir is None:
             remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
-
+        def _dump_code_to_file(codegen: str,
+                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+            runners = handle.get_command_runners()
+            head_runner = runners[0]
+            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+                fp.write(codegen)
+                fp.flush()
+                script_path = os.path.join(target_dir, file_name)
+                # We choose to sync code + exec, because the alternative of
+                # 'ray submit' may not work as it may use system python
+                # (python2) to execute the script. Happens for AWS.
+                head_runner.rsync(source=fp.name,
+                                  target=script_path,
+                                  up=True,
+                                  stream_logs=False)
 
+        cd = f'cd {SKY_REMOTE_WORKDIR}'
         mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
                       f'touch {remote_log_path}')
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
-            # JOB_CMD_IDENTIFIER is used for identifying the process
-            # with pid is the same driver process.
+            # JOB_CMD_IDENTIFIER is used for identifying the process
+            # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
             f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
-
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
-        def _dump_code_to_file(codegen: str,
-                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
-            runners = handle.get_command_runners()
-            head_runner = runners[0]
-            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
-                fp.write(codegen)
-                fp.flush()
-                script_path = os.path.join(target_dir, f'sky_job_{job_id}')
-                # We choose to sync code + exec, because the alternative of 'ray
-                # submit' may not work as it may use system python (python2) to
-                # execute the script. Happens for AWS.
-                head_runner.rsync(source=fp.name,
-                                  target=script_path,
-                                  up=True,
-                                  stream_logs=False)
-
         # Should also be ealier than _is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not use_legacy:
+            try:
+                managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+                if managed_job_dag is not None:
+                    workspace = skypilot_config.get_active_workspace(
+                        force_user_workspace=True)
+                    entrypoint = common_utils.get_current_command()
+
+                    managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+                    for task_id, task in enumerate(managed_job_dag.tasks):
+                        resources_str = backend_utils.get_task_resources_str(
+                            task, is_managed_job=True)
+                        managed_job_tasks.append(
+                            jobsv1_pb2.ManagedJobTask(
+                                task_id=task_id,
+                                name=task.name,
+                                resources_str=resources_str,
+                                metadata_json=task.metadata_json))
+
+                    managed_job_info = jobsv1_pb2.ManagedJobInfo(
+                        name=managed_job_dag.name,
+                        pool=managed_job_dag.pool,
+                        workspace=workspace,
+                        entrypoint=entrypoint,
+                        tasks=managed_job_tasks,
+                        user_id=managed_job_user_id)
+
+                if _is_command_length_over_limit(codegen):
+                    _dump_code_to_file(codegen)
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        # codegen not set - server assumes script uploaded
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+                else:
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        codegen=codegen,
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+
+                backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                    handle.get_grpc_channel()).queue_job(queue_job_request))
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if _is_command_length_over_limit(job_submit_cmd):
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+
+            def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+                if managed_job_dag is not None:
+                    # Add the managed job to job queue database.
+                    managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+                    managed_job_code = managed_job_codegen.set_pending(
+                        job_id,
+                        managed_job_dag,
+                        skypilot_config.get_active_workspace(
+                            force_user_workspace=True),
+                        entrypoint=common_utils.get_current_command(),
+                        user_hash=managed_job_user_id)
+                    # Set the managed job to PENDING state to make sure that
+                    # this managed job appears in the `sky jobs queue`, even
+                    # if it needs to wait to be submitted.
+                    # We cannot set the managed job to PENDING state in the
+                    # job template (jobs-controller.yaml.j2), as it may need
+                    # to wait for the run commands to be scheduled on the job
+                    # controller in high-load cases.
+                    job_submit_cmd += ' && ' + managed_job_code
+                return job_submit_cmd
 
-            returncode, stdout, stderr = self.run_on_head(handle,
-                                                          job_submit_cmd,
-                                                          stream_logs=False,
-                                                          require_outputs=True)
-            # Happens when someone calls `sky exec` but remote is outdated for
-            # running a job. Necessitating calling `sky launch`.
-            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
-                                                        handle.cluster_name)
-            output = stdout + stderr
-            if ((returncode == 255 and 'too long' in output.lower()) or
-                    (returncode == 1 and 'request-uri too large' in output.lower())):
-                # If the generated script is too long, we retry it with dumping
-                # the script to a file and running it with SSH. We use a general
-                # length limit check before but it could be inaccurate on some
-                # systems.
-                # When there is a cloudflare proxy in front of the remote, it could
-                # cause `414 Request-URI Too Large` error.
-                logger.debug('Failed to submit job due to command length limit. '
-                             'Dumping job to file and running it with SSH. '
-                             f'Output: {output}')
-                _dump_code_to_file(codegen)
-                job_submit_cmd = f'{mkdir_code} && {code}'
             job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+
             returncode, stdout, stderr = self.run_on_head(handle,
                                                           job_submit_cmd,
                                                           stream_logs=False,
                                                           require_outputs=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # running a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            output = stdout + stderr
+            if _is_message_too_long(returncode, output=output):
+                # If the job submit script is too long, we need to retry it
+                # with dumping the script to a file and running it the script
+                # on remote cluster instead.
+                logger.debug(
+                    'Failed to submit job due to command length limit. '
+                    'Dumping job to file and running it with SSH. '
+                    f'Output: {output}')
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+                job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+                returncode, stdout, stderr = self.run_on_head(
+                    handle,
+                    job_submit_cmd,
+                    stream_logs=False,
+                    require_outputs=True)
 
-
-
-
-
+        subprocess_utils.handle_returncode(
+            returncode,
+            job_submit_cmd,
+            f'Failed to submit job {job_id}.',
+            stderr=stdout + stderr)
 
         controller = controller_utils.Controllers.from_name(handle.cluster_name)
         if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
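Both the gRPC and the legacy submission paths above share one fallback: if the inline command is (or turns out to be) too long for the transport, dump the generated script to a file on the remote and execute the file instead. A self-contained schematic of that decision; the 128 KiB budget is an illustrative guess, not SkyPilot's actual limit:

```python
import shlex
import subprocess

ARG_MAX_BUDGET = 128 * 1024  # illustrative threshold only


def submit(script: str) -> None:
    # Preferred: inline the script into a single shell command.
    cmd = f'echo {shlex.quote(script)} > /tmp/job.sh && bash /tmp/job.sh'
    if len(cmd) > ARG_MAX_BUDGET:
        # Too long to inline: ship the script as a file instead (this
        # stands in for rsync-ing it to the head node and running it).
        with open('/tmp/job.sh', 'w', encoding='utf-8') as f:
            f.write(script)
        cmd = 'bash /tmp/job.sh'
    subprocess.run(['bash', '-c', cmd], check=True)


submit('echo hello')
```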
@@ -3900,61 +3925,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
         rich_utils.stop_safe_status()
-        if not detach_run:
-            if (handle.cluster_name == controller_utils.Controllers.
-                    JOBS_CONTROLLER.value.cluster_name):
-                self.tail_managed_job_logs(handle, job_id)
-            else:
-                # Sky logs. Not using subprocess.run since it will make the
-                # ssh keep connected after ctrl-c.
-                self.tail_logs(handle, job_id)
 
     def _add_job(self, handle: CloudVmRayResourceHandle,
                  job_name: Optional[str], resources_str: str,
                  metadata: str) -> Tuple[int, str]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                request = jobsv1_pb2.AddJobRequest(
+                    job_name=job_name,
+                    username=common_utils.get_user_hash(),
+                    run_timestamp=self.run_timestamp,
+                    resources_str=resources_str,
+                    metadata=metadata)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+                        request))
+                job_id = response.job_id
+                log_dir = response.log_dir
+                return job_id, log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.add_job(
+                job_name=job_name,
+                username=common_utils.get_user_hash(),
+                run_timestamp=self.run_timestamp,
+                resources_str=resources_str,
+                metadata=metadata)
+            returncode, result_str, stderr = self.run_on_head(
+                handle,
+                code,
+                stream_logs=False,
+                require_outputs=True,
+                separate_stderr=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # adding a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+            # retry for this, after we figure out the reason.
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to fetch job id.',
+                                               stderr)
+            try:
+                job_id_match = _JOB_ID_PATTERN.search(result_str)
+                if job_id_match is not None:
+                    job_id = int(job_id_match.group(1))
+                else:
+                    # For backward compatibility.
+                    job_id = int(result_str)
+                log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+                if log_dir_match is not None:
+                    log_dir = log_dir_match.group(1).strip()
+                else:
+                    # For backward compatibility, use the same log dir as local.
+                    log_dir = self.log_dir
+            except ValueError as e:
+                logger.error(stderr)
+                raise ValueError(f'Failed to parse job id: {result_str}; '
+                                 f'Returncode: {returncode}') from e
         return job_id, log_dir
 
     def _execute(
         self,
         handle: CloudVmRayResourceHandle,
         task: task_lib.Task,
-        detach_run: bool,
         dryrun: bool = False,
     ) -> Optional[int]:
         """Executes the task on the cluster.
@@ -4006,12 +4044,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
         if num_actual_nodes > 1:
-            self._execute_task_n_nodes(handle, task_copy, job_id,
-                                       log_dir)
+            self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
         else:
             # Case: task_lib.Task(run, num_nodes=1)
-            self._execute_task_one_node(handle, task_copy, job_id,
-                                        log_dir)
+            self._execute_task_one_node(handle, task_copy, job_id, log_dir)
 
         return job_id
 
@@ -4054,7 +4090,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         is_identity_mismatch_and_purge = False
         try:
             backend_utils.check_owner_identity(cluster_name)
-        except exceptions.ClusterOwnerIdentityMismatchError
+        except (exceptions.ClusterOwnerIdentityMismatchError,
+                exceptions.CloudUserIdentityError) as e:
             if purge:
                 logger.error(e)
                 verbed = 'terminated' if terminate else 'stopped'
@@ -4068,15 +4105,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             else:
                 raise
         lock_id = backend_utils.cluster_status_lock_id(cluster_name)
-        lock = locks.get_lock(lock_id)
+        lock = locks.get_lock(lock_id, timeout=1)
         # Retry in case new cluster operation comes in and holds the lock
         # right after the lock is removed.
         n_attempts = 2
         while True:
             n_attempts -= 1
-            # In case other running cluster operations are still holding the
-            # lock.
-            lock.force_unlock()
             # We have to kill the cluster requests, because `down` and `stop`
             # should be higher priority than the cluster requests, and we should
             # release the lock from other requests.
@@ -4094,6 +4128,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4094
4128
|
'Failed to kill other launch requests for the '
|
|
4095
4129
|
f'cluster {handle.cluster_name}: '
|
|
4096
4130
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
4131
|
+
# In case other running cluster operations are still holding the
|
|
4132
|
+
# lock.
|
|
4133
|
+
lock.force_unlock()
|
|
4097
4134
|
try:
|
|
4098
4135
|
with lock:
|
|
4099
4136
|
self.teardown_no_lock(
|
|
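
These hunks reorder the teardown locking: the lock is now acquired with a short timeout, and force_unlock() runs after competing launch requests are killed rather than before. A rough, runnable sketch of that ordering, using hypothetical stand-ins (DemoLock, LockUnavailable) for SkyPilot's locks API:

import time

class LockUnavailable(Exception):
    """Stand-in for a lock-acquisition timeout error."""

class DemoLock:
    """Toy lock: pretends another request holds it until force-unlocked."""
    def __init__(self):
        self.held_by_other = True
    def force_unlock(self):
        self.held_by_other = False
    def __enter__(self):
        if self.held_by_other:
            raise LockUnavailable()
        return self
    def __exit__(self, *exc):
        return False

def teardown_with_priority(lock, kill_other_requests, teardown, attempts=2):
    while True:
        attempts -= 1
        kill_other_requests()  # down/stop outrank queued cluster requests
        lock.force_unlock()    # then clear any stale/lower-priority holder
        try:
            with lock:         # short-timeout acquire, like get_lock(..., timeout=1)
                teardown()
            return
        except LockUnavailable:
            if attempts <= 0:
                raise
            time.sleep(0.1)    # brief pause before the single retry

teardown_with_priority(DemoLock(),
                       kill_other_requests=lambda: print('killed requests'),
                       teardown=lambda: print('tore down'))
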
@@ -4126,6 +4163,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_ids: Optional[List[int]] = None,
         stream_logs: bool = True
     ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_job_status(request))
+                statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+                    job_id: job_lib.JobStatus.from_protobuf(proto_status)
+                    for job_id, proto_status in response.job_statuses.items()
+                }
+                return statuses
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+
         code = job_lib.JobLibCodeGen.get_job_status(job_ids)
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       code,
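
This is the first of several methods this diff converts to the same shape: try the typed Skylet gRPC call first, and fall back to the legacy shell-codegen path when the remote Skylet is too old and raises SkyletMethodNotImplementedError. A self-contained sketch of the pattern (not SkyPilot's actual API; the handle dict below is purely illustrative):

class SkyletMethodNotImplementedError(Exception):
    """Raised when the remote Skylet predates a gRPC method."""

def get_job_statuses(handle, job_ids):
    # Prefer the typed RPC when the handle advertises gRPC support.
    if handle.get('grpc_enabled', False):
        try:
            return handle['rpc'](job_ids)
        except SkyletMethodNotImplementedError:
            pass  # Remote is older than this RPC; fall through.
    # Legacy path: generate Python code and run it on the head node.
    return handle['legacy'](job_ids)

def _old_rpc(_):
    raise SkyletMethodNotImplementedError()

handle = {'grpc_enabled': True,
          'rpc': _old_rpc,
          'legacy': lambda ids: {i: 'SUCCEEDED' for i in ids}}
print(get_job_statuses(handle, [1, 2]))  # falls back to the legacy path
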
@@ -4146,16 +4197,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
-        returncode, stdout, _ = self.run_on_head(handle,
-                                                 code,
-                                                 stream_logs=False,
-                                                 require_outputs=True)
-        subprocess_utils.handle_returncode(
-            returncode, code,
-            f'Failed to cancel jobs on cluster {handle.cluster_name}.',
-            stdout)
-        cancelled_ids = message_utils.decode_payload(stdout)
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+                                                       cancel_all=cancel_all,
+                                                       user_hash=user_hash)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+                        request))
+                cancelled_ids = response.cancelled_job_ids
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+                                                     user_hash)
+            returncode, stdout, _ = self.run_on_head(handle,
+                                                     code,
+                                                     stream_logs=False,
+                                                     require_outputs=True)
+            subprocess_utils.handle_returncode(
+                returncode, code,
+                f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+                stdout)
+            cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4172,20 +4239,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
-        returncode, stdout, stderr = self.run_on_head(handle,
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
                                                       code,
                                                       stream_logs=False,
                                                       require_outputs=True,
                                                       separate_stderr=True)
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to sync logs.', stderr)
-        job_to_dir = message_utils.decode_payload(stdout)
-        if not job_to_dir:
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        'No matching log directories found'
-                        f'{colorama.Style.RESET_ALL}')
-            return {}
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
 
         job_ids = list(job_to_dir.keys())
         dirs = list(job_to_dir.values())
@@ -4195,9 +4290,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
                 constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
-        local_log_dirs = [
-            os.path.join(local_dir, dir) for dir in dirs
-        ]
+        # Include cluster name in local log directory path to avoid conflicts
+        # when the same job_id exists on different clusters
+        cluster_name = handle.cluster_name
+        local_log_dirs = []
+        for remote_log_dir in dirs:
+            if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
+                # Extract the job-specific directory name from the full path
+                # e.g., ~/sky_logs/1-job_name -> 1-job_name
+                job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
+                                                 '').lstrip('/')
+                local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
+            else:
+                # remote_log_dir is already just the job directory name (e.g.,
+                # "1-job_name")
+                local_log_dir = os.path.join(local_dir, cluster_name,
+                                             remote_log_dir)
+            local_log_dirs.append(local_log_dir)
 
         runners = handle.get_command_runners()
 
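
The new mapping inserts the cluster name between local_dir and the job directory so the same job id on two clusters cannot collide on disk. A runnable sketch of just that path logic (the constant below is a stand-in for constants.SKY_LOGS_DIRECTORY):

import os

SKY_LOGS_DIRECTORY = '~/sky_logs'  # stand-in for the real constant

def local_log_dir_for(remote_log_dir, local_dir, cluster_name):
    if SKY_LOGS_DIRECTORY in remote_log_dir:
        # '~/sky_logs/1-job_name' -> '1-job_name'
        job_dir = remote_log_dir.replace(SKY_LOGS_DIRECTORY, '').lstrip('/')
    else:
        job_dir = remote_log_dir  # already just '1-job_name'
    return os.path.join(local_dir, cluster_name, job_dir)

print(local_log_dir_for('~/sky_logs/1-train', '/tmp/logs', 'my-cluster'))
# -> /tmp/logs/my-cluster/1-train
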
@@ -4261,6 +4370,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         The exit code of the tail command. Returns code 100 if the job has
         failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
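
Tailing is the one streaming RPC here: each response chunk may carry a log line plus the latest exit code, the last observed exit code becomes the command's return value, and a CANCELLED status is treated as a clean stop. A toy, self-contained sketch of that consumption loop against a fake stream:

from dataclasses import dataclass

@dataclass
class LogChunk:
    log_line: str
    exit_code: int

def fake_stream():
    yield LogChunk('step 1/2 done\n', 0)
    yield LogChunk('step 2/2 done\n', 0)
    yield LogChunk('', 100)  # final chunk carries the job's exit code

def tail(stream):
    last_exit_code = 0
    for resp in stream:
        if resp.log_line:
            print(resp.log_line, end='', flush=True)
        last_exit_code = resp.exit_code
    return last_exit_code

print('exit:', tail(fake_stream()))  # exit: 100
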
@@ -4298,6 +4429,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                             tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4343,20 +4475,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            #
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
-                job_name=job_name)
-            returncode, job_ids_payload, stderr = self.run_on_head(
-                handle,
-                code,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync down logs.', stderr)
-            job_ids = message_utils.decode_payload(job_ids_payload)
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -4384,18 +4533,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
-            returncode, run_timestamps_payload, stderr = self.run_on_head(
-                handle,
-                code,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync logs.', stderr)
-            # returns with a dict of {job_id: run_timestamp}
-            run_timestamps = message_utils.decode_payload(
-                run_timestamps_payload)
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -4462,11 +4632,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                     exist_ok=True)
         log_file = os.path.join(local_log_dir, 'run.log')
 
-        code = managed_jobs.ManagedJobCodeGen.stream_logs(
-            job_name=None,
-            job_id=int(job_id),
-            follow=False,
-            controller=False)
+        # TODO(kevin): Migrate stream_logs to gRPC
+        code = managed_jobs.ManagedJobCodeGen.stream_logs(
+            job_name=None,
+            job_id=int(job_id),
+            follow=False,
+            controller=False)
         # With the stdin=subprocess.DEVNULL, the ctrl-c will not
         # kill the process, so we need to handle it manually here.
         if threading.current_thread() is threading.main_thread():
@@ -4507,6 +4678,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        try:
+            handle.close_skylet_ssh_tunnel()
+        except Exception as e:  # pylint: disable=broad-except
+            # Not critical to the cluster teardown, just log a warning.
+            logger.warning(
+                'Failed to close Skylet SSH tunnel for cluster '
+                f'{handle.cluster_name}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
         exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
         # We have to kill the cluster requests again within the lock, because
         # any pending requests on the same cluster should be cancelled after
@@ -4543,7 +4723,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # observed in AWS. See also
                     # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
                     force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                    acquire_per_cluster_status_lock=False))
+                    cluster_lock_already_held=True,
+                    retry_if_missing=False))
             cluster_status_fetched = True
         except exceptions.ClusterStatusFetchingError:
             logger.warning(
@@ -4551,10 +4732,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'{handle.cluster_name!r}. Assuming the cluster is still '
                 'up.')
         if not cluster_status_fetched:
-            record = global_user_state.get_cluster_from_name(
+            status = global_user_state.get_status_from_cluster_name(
                 handle.cluster_name)
-            prev_cluster_status = record[
-                'status'] if record is not None else None
+            prev_cluster_status = status if status is not None else None
         if prev_cluster_status is None:
             # When the cluster is not in the cluster table, we guarantee that
             # all related resources / cache / config are cleaned up, i.e. it
@@ -4786,7 +4966,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                                  config['provider'])
             ports_cleaned_up = True
         except exceptions.NotSupportedError:
-            pass
+            ports_cleaned_up = True
         except exceptions.PortDoesNotExistError:
             logger.debug('Ports do not exist. Skipping cleanup.')
         except Exception as e:  # pylint: disable=broad-except
@@ -4811,7 +4991,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                                        failover)
             custom_multi_network_cleaned_up = True
         except exceptions.NotSupportedError:
-            pass
+            custom_multi_network_cleaned_up = True
         except Exception as e:  # pylint: disable=broad-except
             if purge:
                 msg = common_utils.format_exception(e, use_bracket=True)
@@ -4913,7 +5093,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         cluster_yaml_path = handle.cluster_yaml
         handle.cluster_yaml = None
         global_user_state.update_cluster_handle(handle.cluster_name, handle)
-        global_user_state.remove_cluster_yaml(handle.cluster_name)
+        # Removing the cluster YAML can cause some unexpected stability issues.
+        # See #5011.
+        # global_user_state.remove_cluster_yaml(handle.cluster_name)
         common_utils.remove_file_if_exists(cluster_yaml_path)
 
     def set_autostop(self,
@@ -4974,9 +5156,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
                 down=down,
             )
-            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
-                handle.get_grpc_channel()).
-                set_autostop(request))
+            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                handle.get_grpc_channel()).set_autostop(request))
         else:
             code = autostop_lib.AutostopCodeGen.set_autostop(
                 idle_minutes_to_autostop, self.NAME, wait_for, down)
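
All of the unary RPC call sites above go through backend_utils.invoke_skylet_with_retries, which takes a thunk so the call can be re-issued. The diff does not show that helper; the sketch below is only a guess at the general shape of such a retry wrapper, not SkyPilot's implementation:

import time

def invoke_with_retries(thunk, attempts=3, delay=0.5,
                        retriable=(ConnectionError,)):
    for attempt in range(attempts):
        try:
            return thunk()  # thunk re-creates the client call on each try
        except retriable:
            if attempt == attempts - 1:
                raise
            time.sleep(delay * (2 ** attempt))  # exponential backoff

calls = {'n': 0}
def flaky():
    calls['n'] += 1
    if calls['n'] < 3:
        raise ConnectionError('transient')
    return 'ok'

print(invoke_with_retries(flaky))  # 'ok' on the third attempt
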
@@ -5015,8 +5196,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         try:
             request = autostopv1_pb2.IsAutostoppingRequest()
             response = backend_utils.invoke_skylet_with_retries(
-                lambda: SkyletClient(handle.get_grpc_channel()).is_autostopping(
-                    request))
+                lambda: SkyletClient(handle.get_grpc_channel()
+                                    ).is_autostopping(request))
             return response.is_autostopping
         except Exception as e:  # pylint: disable=broad-except
             # The cluster may have been terminated, causing the gRPC call
@@ -5128,7 +5309,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             exceptions.InvalidClusterNameError: If the cluster name is invalid.
             # TODO(zhwu): complete the list of exceptions.
         """
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
         if record is None:
             handle_before_refresh = None
             status_before_refresh = None
@@ -5148,7 +5330,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             record = backend_utils.refresh_cluster_record(
                 cluster_name,
                 force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                acquire_per_cluster_status_lock=False,
+                cluster_lock_already_held=True,
+                include_user_info=False,
+                summary_response=True,
             )
             if record is not None:
                 prev_cluster_status = record['status']
@@ -5264,33 +5448,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         common_utils.check_cluster_name_is_valid(cluster_name)
 
         if to_provision is None:
            [27 removed lines (old 5267-5293) were not captured in this
            rendering; the surviving fragments show a comment block and a
            'handle_before_refresh,' argument]
+            # Recently terminated after refresh. OPTIMIZE usually ran outside
+            # the lock, so that decision may be stale by now. Under the lock,
+            # ensure we always have a concrete plan via the following order:
+            # 1) Reuse last placement snapshot (if available);
+            # 2) Else, call injected planner for a fresh plan.
+            # If we still have a pre-refresh handle snapshot with a concrete
+            # placement, prefer reusing it.
+            if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+                    handle_before_refresh.launched_resources is not None):
+                to_provision = handle_before_refresh.launched_resources
+                # Ensure the requested task fits the previous placement.
+                self.check_resources_fit_cluster(handle_before_refresh, task)
+                # Mirror the original message for reuse path.
+                status_before_refresh_str = None
+                if status_before_refresh is not None:
+                    status_before_refresh_str = status_before_refresh.value
+                logger.info(
+                    f'The cluster {cluster_name!r} (status: '
+                    f'{status_before_refresh_str}) was not found on the cloud: '
+                    'it may be autodowned, manually terminated, or its launch '
+                    'never succeeded. Provisioning a new cluster by using the '
+                    'same resources as its original launch.')
+            elif self._planner is not None:
+                to_provision = self._planner(task)
+                logger.info(
+                    'Previous placement snapshot missing; computing a fresh '
+                    'plan for provisioning.')
+            else:
+                # Without a snapshot or planner, we cannot proceed safely.
+                # Surface a user-friendly error without a long traceback.
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        'No concrete launch plan available after recent cloud '
+                        f'termination of cluster {cluster_name!r}. Ensure the '
+                        'OPTIMIZE stage runs or provide concrete resources.')
 
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
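
The new fallback ladder always produces a concrete plan or fails loudly: reuse the pre-refresh placement snapshot, else consult the injected planner, else raise. A compact, self-contained sketch of the same decision order (types simplified to strings):

from typing import Callable, Optional

def resolve_plan(snapshot: Optional[str],
                 planner: Optional[Callable[[], str]]) -> str:
    if snapshot is not None:
        return snapshot          # 1) reuse the previous placement
    if planner is not None:
        return planner()         # 2) compute a fresh plan
    raise RuntimeError(          # 3) nothing to fall back on
        'No concrete launch plan available.')

print(resolve_plan('gcp:n1-standard-8', None))      # reuses the snapshot
print(resolve_plan(None, lambda: 'aws:m6i.large'))  # fresh plan
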
@@ -5639,7 +5831,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                            handle: CloudVmRayResourceHandle) -> Dict[str, str]:
         """Returns the environment variables for the task."""
-        env_vars = task.envs_and_secrets
+        env_vars = task_lib.get_plaintext_envs_and_secrets(
+            task.envs_and_secrets)
         # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
         # by the controller.
         if constants.TASK_ID_ENV_VAR not in env_vars:
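
The diff swaps direct access to task.envs_and_secrets for a task_lib.get_plaintext_envs_and_secrets(...) helper. Its implementation is not shown here; the sketch below only illustrates the kind of flattening such a helper might perform, and the (value, is_secret) tuple shape is an assumption:

from typing import Dict, Tuple

def plaintext_envs_and_secrets(
        envs_and_secrets: Dict[str, Tuple[str, bool]]) -> Dict[str, str]:
    # Each value is assumed to be (value, is_secret); secrets end up as
    # plain strings because the task process ultimately needs them.
    return {key: value
            for key, (value, _is_secret) in envs_and_secrets.items()}

merged = plaintext_envs_and_secrets({
    'MODEL': ('llama', False),
    'HF_TOKEN': ('hf_abc123', True),
})
print(sorted(merged))  # ['HF_TOKEN', 'MODEL']
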
@@ -5651,9 +5844,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         env_vars.update(self._skypilot_predefined_env_vars(handle))
         return env_vars
 
+    def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+        """Returns the user id for the managed job."""
+        if task.managed_job_dag is not None:
+            return task.envs[constants.USER_ID_ENV_VAR]
+        return None
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool, remote_log_dir: str) -> None:
+                               remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
         log_dir = os.path.join(remote_log_dir, 'tasks')
 
@@ -5663,9 +5862,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
         codegen.add_prologue(job_id)
-        codegen.add_gang_scheduling_placement_group_and_setup(
+        codegen.add_setup(
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
@@ -5674,31 +5873,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
-        if callable(task.run):
-            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
-            run_fn_name = task.run.__name__
-            codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        command_for_node = task.run if isinstance(task.run, str) else None
-        codegen.add_ray_task(
-            bash_script=command_for_node,
+        codegen.add_task(
+            1,
+            bash_script=task.run,
             env_vars=task_env_vars,
             task_name=task.name,
-            ray_resources_dict=backend_utils.get_task_demands_dict(task),
+            resources_dict=backend_utils.get_task_demands_dict(task),
             log_dir=log_dir)
 
         codegen.add_epilogue()
 
-        self._exec_code_on_head(
-            handle,
-            codegen.build(),
-            job_id,
-            detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag)
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool, remote_log_dir: str) -> None:
+                              remote_log_dir: str) -> None:
         # Strategy:
         #   ray.init(...)
         #   for node:
@@ -5712,9 +5907,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
         codegen.add_prologue(job_id)
-        codegen.add_gang_scheduling_placement_group_and_setup(
+        codegen.add_setup(
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
@@ -5723,31 +5918,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
-        if callable(task.run):
-            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
-            run_fn_name = task.run.__name__
-            codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        # TODO(zhwu): The resources limitation for multi-node ray.tune and
-        # horovod should be considered.
-        for i in range(num_actual_nodes):
-            command_for_node = task.run if isinstance(task.run, str) else None
-
-            # Ray's per-node resources, to constrain scheduling each command to
-            # the corresponding node, represented by private IPs.
-            codegen.add_ray_task(
-                bash_script=command_for_node,
-                env_vars=task_env_vars,
-                task_name=task.name,
-                ray_resources_dict=backend_utils.get_task_demands_dict(task),
-                log_dir=log_dir,
-                gang_scheduling_id=i)
+        codegen.add_task(
+            num_actual_nodes,
+            bash_script=task.run,
+            env_vars=task_env_vars,
+            task_name=task.name,
+            resources_dict=backend_utils.get_task_demands_dict(task),
+            log_dir=log_dir)
 
         codegen.add_epilogue()
         # TODO(zhanghao): Add help info for downloading logs.
-        self._exec_code_on_head(
-            handle,
-            codegen.build(),
-            job_id,
-            detach_run=detach_run,
-            managed_job_dag=task.managed_job_dag)
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
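
Both executors now make a single add_task(num_nodes, ...) call on the new task_codegen.RayCodeGen instead of looping add_ray_task per node, so the per-node expansion lives inside the codegen. An illustrative-only builder with the same shape (not the real RayCodeGen API):

class TinyCodeGen:
    def __init__(self):
        self._lines = []

    def add_prologue(self, job_id):
        self._lines.append(f'# job {job_id}')

    def add_task(self, num_nodes, bash_script, log_dir):
        # The expansion over nodes now lives here, not at every call site.
        for i in range(num_nodes):
            self._lines.append(
                f'run_on_node({i}, {bash_script!r}, log_dir={log_dir!r})')

    def add_epilogue(self):
        self._lines.append('# wait for all tasks')

    def build(self):
        return '\n'.join(self._lines)

gen = TinyCodeGen()
gen.add_prologue(42)
gen.add_task(2, 'echo hello', '~/sky_logs/42/tasks')
gen.add_epilogue()
print(gen.build())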