skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -81,6 +81,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource

 if typing.TYPE_CHECKING:
     import grpc
@@ -192,18 +193,6 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
     pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')

-# The maximum size of a command line arguments is 128 KB, i.e. the command
-# executed with /bin/sh should be less than 128KB.
-# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
-#
-# If a user have very long run or setup commands, the generated command may
-# exceed the limit, as we directly include scripts in job submission commands.
-# If the command is too long, we instead write it to a file, rsync and execute
-# it.
-#
-# We use 100KB as a threshold to be safe for other arguments that
-# might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
 _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
     ('too long', 255),
     ('request-uri too large', 1),
@@ -218,18 +207,6 @@ _RESOURCES_UNAVAILABLE_LOG = (
 _CLUSTER_LOCK_TIMEOUT = 5.0


-def _is_command_length_over_limit(command: str) -> bool:
-    """Check if the length of the command exceeds the limit.
-
-    We calculate the length of the command after quoting the command twice as
-    when it is executed by the CommandRunner, the command will be quoted twice
-    to ensure the correctness, which will add significant length to the command.
-    """
-
-    quoted_length = len(shlex.quote(shlex.quote(command)))
-    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
-
-
 def _is_message_too_long(returncode: int,
                          output: Optional[str] = None,
                          file_path: Optional[str] = None) -> bool:
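The command-length helper removed above does not disappear from the package: later hunks in this same diff call `backend_utils.is_command_length_over_limit(...)`, so the check effectively moves into `sky/backends/backend_utils.py` (touched with +95 -12 in the file list). As a minimal, self-contained sketch of what that check does, based only on the removed code shown here (the 100 KB threshold and the double quoting both come from it):

```python
import shlex

# From the removed constants: stay under the ~128 KB /bin/sh argument limit,
# leaving headroom for extra arguments added when running over ssh.
_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024


def is_command_length_over_limit(command: str) -> bool:
    """Return True if the command would exceed the inline-script limit.

    The command is quoted twice before measuring because the CommandRunner
    quotes it twice when executing it, which adds significant length.
    """
    quoted_length = len(shlex.quote(shlex.quote(command)))
    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
```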
@@ -294,6 +271,7 @@ def _get_cluster_config_template(cloud):
         clouds.Lambda: 'lambda-ray.yml.j2',
         clouds.IBM: 'ibm-ray.yml.j2',
         clouds.SCP: 'scp-ray.yml.j2',
+        clouds.Slurm: 'slurm-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
         clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
@@ -938,8 +916,10 @@ class RetryingVmProvisioner(object):
         elif to_provision.region is not None and to_provision.cloud is not None:
             # For public clouds, provision.region is always set.
             if clouds.SSH().is_same_cloud(to_provision.cloud):
+                ssh_node_pool_name = common_utils.removeprefix(
+                    to_provision.region, 'ssh-')
                 message += (
-                    f'in SSH Node Pool ({
+                    f'in SSH Node Pool ({ssh_node_pool_name}) '
                     f'for {requested_resources}. The SSH Node Pool may not '
                     'have enough resources.')
             elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
@@ -1199,7 +1179,9 @@ class RetryingVmProvisioner(object):
             if isinstance(to_provision.cloud, clouds.Kubernetes):
                 suffix = '.'
                 if region.name.startswith('ssh-'):
-
+                    ssh_node_pool_name = common_utils.removeprefix(
+                        region.name, 'ssh-')
+                    suffix = f' ({ssh_node_pool_name})'
                 logger.info(
                     ux_utils.starting_message(
                         f'Launching{controller_str} on '
@@ -2516,7 +2498,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     @property
     def is_grpc_enabled_with_flag(self) -> bool:
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
-        return env_options.Options.ENABLE_GRPC.get() and
+        return (env_options.Options.ENABLE_GRPC.get() and
+                self.is_grpc_enabled and
+                not isinstance(self.launched_resources.cloud, clouds.Slurm))

     def __getstate__(self):
         state = self.__dict__.copy()
@@ -2753,6 +2737,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)

+    def get_job_exit_codes(
+        self,
+        request: 'jobsv1_pb2.GetJobExitCodesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobExitCodesResponse':
+        return self._jobs_stub.GetJobExitCodes(request, timeout=timeout)
+
     def tail_logs(
         self,
         request: 'jobsv1_pb2.TailLogsRequest',
@@ -3061,6 +3052,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 'sky api status -v | grep '
                 f'{cluster_name}'))

+    def _maybe_clear_external_cluster_failures(
+            self, cluster_name: str,
+            prev_cluster_status: Optional[status_lib.ClusterStatus]) -> None:
+        """Clear any existing cluster failures when reusing a cluster.
+
+        Clear any existing cluster failures when reusing a cluster. This ensures
+        that when a cluster failure is detected (causing the cluster to be
+        marked as INIT), the user can recover the cluster via `sky start` or
+        `sky launch` and clear the failure.
+        """
+        if prev_cluster_status is not None:
+            failures = ExternalFailureSource.clear(cluster_name=cluster_name)
+            if failures:
+                failure_details = [f'"{f["failure_mode"]}"' for f in failures]
+                plural = 's' if len(failures) > 1 else ''
+                logger.info(f'{colorama.Style.DIM}Cleared {len(failures)} '
+                            f'existing cluster failure{plural} for cluster '
+                            f'{cluster_name!r}: {", ".join(failure_details)}'
+                            f'{colorama.Style.RESET_ALL}')
+
     def _locked_provision(
         self,
         lock_id: str,
@@ -3091,6 +3102,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision_config.num_nodes, to_provision_config.resources)
         usage_lib.messages.usage.update_cluster_status(prev_cluster_status)

+        self._maybe_clear_external_cluster_failures(cluster_name,
+                                                    prev_cluster_status)
+
         # TODO(suquark): once we have sky on PyPI, we should directly
         # install sky from PyPI.
         # NOTE: can take ~2s.
@@ -3449,7 +3463,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 ssh_user=handle.ssh_user,
                 docker_user=handle.docker_user)
             cluster_utils.SSHConfigHelper.add_cluster(
-                handle.cluster_name, handle.
+                handle.cluster_name, handle.cluster_name_on_cloud,
+                handle.cached_external_ips, auth_config,
                 handle.cached_external_ssh_ports, handle.docker_user,
                 handle.ssh_user)

@@ -3596,6 +3611,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
+
         start = time.time()

         if task.setup is None:
@@ -3647,7 +3663,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             _dump_final_script(setup_script,
                                constants.PERSISTENT_SETUP_SCRIPT_PATH)

-        if detach_setup or
+        if (detach_setup or
+                backend_utils.is_command_length_over_limit(encoded_script)):
             _dump_final_script(setup_script)
             create_script_code = 'true'
         else:
@@ -3788,23 +3805,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 up=True,
                 stream_logs=False)

-
-        mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
-                      f'touch {remote_log_path}')
+        mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
             # JOB_CMD_IDENTIFIER is used for identifying the process
             # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
-            f'{
+            f'{constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
+
+        # For Slurm, we need to wait for the job to complete before exiting,
+        # because Slurm's proctrack/cgroup kills all processes when the srun
+        # job step ends, including child processes launched as a separate
+        # process group.
+        # So this keeps srun alive so the job driver process that was spawned
+        # (and runs in the background) by job_lib.JobScheduler.schedule_step()
+        # does not get killed.
+        # Note: proctrack/cgroup is enabled by default on Nebius' Managed
+        # Soperator.
+        is_slurm = isinstance(handle.launched_resources.cloud, clouds.Slurm)
+        if is_slurm:
+            wait_code = job_lib.JobLibCodeGen.wait_for_job(job_id)
+            code = code + ' && ' + wait_code
+
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

-        # Should also be ealier than
+        # Should also be ealier than is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
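To make the Slurm branch above easier to follow, here is a minimal sketch (not the package's actual code; `build_job_submit_cmd` and all snippet strings are illustrative placeholders) of how this hunk combines the pieces: the wait step is appended only on Slurm so the srun job step stays alive until the background job driver exits, and the mkdir/script-creation/run fragments are then joined into one shell command.

```python
def build_job_submit_cmd(mkdir_code: str, create_script_code: str,
                         queue_code: str, wait_code: str,
                         is_slurm: bool) -> str:
    """Sketch of the command assembly shown in the hunk above."""
    code = queue_code
    if is_slurm:
        # Keep the srun job step alive until the job driver finishes, so
        # Slurm's proctrack/cgroup does not kill the background driver.
        code = code + ' && ' + wait_code
    return ' && '.join([mkdir_code, create_script_code, code])


# Placeholder usage (all values below are illustrative, not real
# SkyPilot-generated snippets):
print(build_job_submit_cmd(
    mkdir_code='mkdir -p <remote_log_dir> && touch <remote_log_path>',
    create_script_code='{ echo <encoded_script> > <script_path>; }',
    queue_code='<queue job and start driver>',
    wait_code='<wait for job to finish>',
    is_slurm=True))
```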
@@ -3837,7 +3867,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 tasks=managed_job_tasks,
                 user_id=managed_job_user_id)

-            if
+            if backend_utils.is_command_length_over_limit(codegen):
                 _dump_code_to_file(codegen)
                 queue_job_request = jobsv1_pb2.QueueJobRequest(
                     job_id=job_id,
@@ -3859,7 +3889,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             use_legacy = True

         if use_legacy:
-            if
+            if backend_utils.is_command_length_over_limit(job_submit_cmd):
                 _dump_code_to_file(codegen)
                 job_submit_cmd = f'{mkdir_code} && {code}'

@@ -3886,10 +3916,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

             job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)

-
-
-
-
+            # For Slurm, run in background so that SSH returns immediately.
+            # This is needed because we add the wait_for_job code above which
+            # makes the command block until the job completes.
+            returncode, stdout, stderr = self.run_on_head(
+                handle,
+                job_submit_cmd,
+                stream_logs=False,
+                require_outputs=True,
+                run_in_background=is_slurm)
             # Happens when someone calls `sky exec` but remote is outdated for
             # running a job. Necessitating calling `sky launch`.
             backend_utils.check_stale_runtime_on_remote(returncode, stderr,
@@ -3906,11 +3941,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 _dump_code_to_file(codegen)
                 job_submit_cmd = f'{mkdir_code} && {code}'
                 job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+            # See comment above for why run_in_background=is_slurm.
             returncode, stdout, stderr = self.run_on_head(
                 handle,
                 job_submit_cmd,
                 stream_logs=False,
-                require_outputs=True
+                require_outputs=True,
+                run_in_background=is_slurm)

             subprocess_utils.handle_returncode(
                 returncode,
|
|
|
4969
5006
|
ports_cleaned_up = True
|
|
4970
5007
|
except exceptions.PortDoesNotExistError:
|
|
4971
5008
|
logger.debug('Ports do not exist. Skipping cleanup.')
|
|
5009
|
+
ports_cleaned_up = True
|
|
4972
5010
|
except Exception as e: # pylint: disable=broad-except
|
|
4973
5011
|
if purge:
|
|
4974
5012
|
msg = common_utils.format_exception(e, use_bracket=True)
|
|
@@ -5041,11 +5079,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     config['provider'],
                     non_terminated_only=False)

-
+                unexpected_nodes = []
                 for node_id, node_status_tuple in node_status_dict.items():
                     node_status, reason = node_status_tuple
-
-                    logger.debug(f'{node_id} status: {node_status}{
+                    reason_str = '' if reason is None else f' ({reason})'
+                    logger.debug(f'{node_id} status: {node_status}{reason_str}')
                     # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
                     # between "stopping/stopped" and "terminating/terminated",
                     # so we allow for either status instead of casing on
@@ -5053,19 +5091,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     if node_status not in [
                             None, status_lib.ClusterStatus.STOPPED
                     ]:
-
-                        break
+                        unexpected_nodes.append((node_id, node_status, reason))

-                if
+                if not unexpected_nodes:
                     break

                 attempts += 1
                 if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
                     time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
                 else:
-
-
-
+                    unexpected_nodes_str = '\n'.join([
+                        f' - {node_id}: {node_status}' +
+                        (f' ({reason})' if reason else '')
+                        for node_id, node_status, reason in unexpected_nodes
+                    ])
+                    raise RuntimeError(f'Instances in unexpected state:\n'
+                                       f'{unexpected_nodes_str}')

             # If cluster_yaml is None, the cluster should ensured to be terminated,
             # so we don't need to do the double check.
@@ -5352,6 +5393,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert handle is not None
         # Cluster already exists.
         self.check_resources_fit_cluster(handle, task)
+
         # Use the existing cluster.
         assert handle.launched_resources is not None, (cluster_name, handle)
         # Take a random resource in order to get resource info that applies
@@ -5403,27 +5445,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         for resource in task.resources:
             assert (resource.cluster_config_overrides ==
                     one_task_resource.cluster_config_overrides)
-
+
+        cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+            cluster_name)
+        cluster_yaml_obj = (yaml_utils.safe_load(cluster_yaml_str)
+                            if cluster_yaml_str is not None else None)
+
+        def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+            return (yaml_obj.get('available_node_types',
+                                 {}).get('ray_head_default',
+                                         {}).get('node_config', {}))
+
+        if isinstance(to_provision.cloud,
+                      clouds.Kubernetes) and cluster_yaml_obj is not None:
             # Warn users if the Kubernetes pod config is different
             # from the existing cluster.
-            cluster_yaml_str = global_user_state.get_cluster_yaml_str(
-                cluster_name)
-            actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
             desired_cluster_yaml_obj = (
                 kubernetes_utils.combine_pod_config_fields_and_metadata(
-
+                    cluster_yaml_obj,
                     cluster_config_overrides=one_task_resource.
                     cluster_config_overrides,
                     cloud=to_provision.cloud,
                     context=to_provision.region))

-            def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
-                return (yaml_obj.get('available_node_types',
-                                     {}).get('ray_head_default',
-                                             {}).get('node_config', {}))
-
             if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
-
+                    cluster_yaml_obj):
                 # pylint: disable=line-too-long
                 logger.warning(
                     f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
@@ -5434,6 +5480,101 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
                     f'{colorama.Style.RESET_ALL}')

+        # Check for volume mount warnings
+        if task.volume_mounts:
+            # Get existing cluster's volume mounts from cluster yaml
+            existing_volume_names = set()
+            try:
+                if cluster_yaml_obj is not None:
+                    # Extract volume names from existing cluster
+                    node_config = _get_pod_config(cluster_yaml_obj)
+
+                    if isinstance(to_provision.cloud, clouds.Kubernetes):
+                        # Check for K8s-style persistent volumes
+                        # (spec.volumes)
+                        # See sky/templates/kubernetes-ray.yml.j2.
+                        volumes = node_config.get('spec',
+                                                  {}).get('volumes', [])
+                        for vol in volumes:
+                            # Volume from PVC has structure:
+                            # - name: <volume_name>
+                            #   persistentVolumeClaim:
+                            #     claimName: <volume_name_on_cloud>
+                            if 'persistentVolumeClaim' in vol:
+                                pvc = vol.get('persistentVolumeClaim', {})
+                                # Use claimName (volume_name_on_cloud) to
+                                # be consistent with RunPod.
+                                vol_name_on_cloud = pvc.get('claimName')
+                                if vol_name_on_cloud:
+                                    existing_volume_names.add(
+                                        vol_name_on_cloud)
+
+                        # Check for K8s ephemeral volumes
+                        # See sky/templates/kubernetes-ray.yml.j2.
+                        provider_config = cluster_yaml_obj.get(
+                            'provider', {})
+                        ephemeral_specs = provider_config.get(
+                            'ephemeral_volume_specs', [])
+                        for spec in ephemeral_specs:
+                            # For ephemeral volumes, we check the mount
+                            # path.
+                            mount_path = spec.get('path')
+                            if mount_path:
+                                existing_volume_names.add(mount_path)
+
+                    elif isinstance(to_provision.cloud, clouds.RunPod):
+                        # Check for custom VolumeMounts config
+                        # (e.g. RunPod)
+                        # See sky/templates/runpod-ray.yml.j2.
+                        volume_mounts_config = node_config.get(
+                            'VolumeMounts', [])
+                        for vol_mount in volume_mounts_config:
+                            vol_name = vol_mount.get('VolumeNameOnCloud')
+                            if vol_name:
+                                existing_volume_names.add(vol_name)
+            except Exception as e:  # pylint: disable=broad-except
+                # If we can't get the existing volume mounts, log debug
+                # and skip the warning check
+                logger.debug(f'Failed to check existing volume mounts: {e}',
+                             exc_info=True)
+
+            # Check if task has new volumes not in existing cluster
+            new_ephemeral_volumes = []
+            new_persistent_volumes = []
+            for volume_mount in task.volume_mounts:
+                # Compare using volume_name for user-facing name
+                if volume_mount.is_ephemeral:
+                    if volume_mount.path not in existing_volume_names:
+                        new_ephemeral_volumes.append(volume_mount.path)
+                elif (volume_mount.volume_name not in existing_volume_names
+                      and volume_mount.volume_config.name_on_cloud
+                      not in existing_volume_names):
+                    new_persistent_volumes.append(volume_mount.volume_name)
+
+            if new_ephemeral_volumes or new_persistent_volumes:
+                msg_parts = []
+                if new_ephemeral_volumes:
+                    msg_parts.append(f'new ephemeral volume(s) with path '
+                                     f'{", ".join(new_ephemeral_volumes)}')
+                if new_persistent_volumes:
+                    msg_parts.append(
+                        f'new volume(s) {", ".join(new_persistent_volumes)}'
+                    )
+
+                volume_msg = ' and '.join(msg_parts)
+                # Capitalize the first letter of the message
+                volume_msg = volume_msg[0].upper() + volume_msg[1:]
+
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}WARNING: {volume_msg} '
+                    f'specified in task but not '
+                    f'mounted to existing cluster "{cluster_name}". '
+                    f'These volumes will not be mounted to the cluster. '
+                    f'To mount new volumes, either:\n'
+                    f'  • Use a new cluster, or\n'
+                    f'  • Terminate and recreate this cluster'
+                    f'{colorama.Style.RESET_ALL}')
+
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
             to_provision,
@@ -5850,6 +5991,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return task.envs[constants.USER_ID_ENV_VAR]
         return None

+    def _get_task_codegen_class(
+            self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
+        """Returns the appropriate TaskCodeGen for the given handle."""
+        if isinstance(handle.launched_resources.cloud, clouds.Slurm):
+            assert (handle.cached_cluster_info
+                    is not None), ('cached_cluster_info must be set')
+            head_instance = handle.cached_cluster_info.get_head_instance()
+            assert (head_instance is not None), (
+                'Head instance not found in cached cluster info')
+            slurm_job_id = head_instance.tags.get('job_id')
+            assert (slurm_job_id
+                    is not None), ('job_id tag not found in head instance')
+            return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
+        else:
+            return task_codegen.RayCodeGen()
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
                                remote_log_dir: str) -> None:
@@ -5862,15 +6019,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

         task_env_vars = self._get_task_env_vars(task, job_id, handle)

-        codegen =
+        codegen = self._get_task_codegen_class(handle)
+
         codegen.add_prologue(job_id)
         codegen.add_setup(
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
             env_vars=task_env_vars,
+            log_dir=log_dir,
             setup_cmd=self._setup_cmd,
-            setup_log_path=os.path.join(log_dir, 'setup.log'),
         )

         codegen.add_task(
@@ -5907,15 +6065,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         task_env_vars = self._get_task_env_vars(task, job_id, handle)

-        codegen =
+        codegen = self._get_task_codegen_class(handle)
+
         codegen.add_prologue(job_id)
         codegen.add_setup(
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
             env_vars=task_env_vars,
+            log_dir=log_dir,
             setup_cmd=self._setup_cmd,
-            setup_log_path=os.path.join(log_dir, 'setup.log'),
         )

         codegen.add_task(