skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py
CHANGED

@@ -192,18 +192,6 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
     pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')
 
-# The maximum size of a command line arguments is 128 KB, i.e. the command
-# executed with /bin/sh should be less than 128KB.
-# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
-#
-# If a user have very long run or setup commands, the generated command may
-# exceed the limit, as we directly include scripts in job submission commands.
-# If the command is too long, we instead write it to a file, rsync and execute
-# it.
-#
-# We use 100KB as a threshold to be safe for other arguments that
-# might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
 _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
     ('too long', 255),
     ('request-uri too large', 1),
@@ -218,18 +206,6 @@ _RESOURCES_UNAVAILABLE_LOG = (
 _CLUSTER_LOCK_TIMEOUT = 5.0
 
 
-def _is_command_length_over_limit(command: str) -> bool:
-    """Check if the length of the command exceeds the limit.
-
-    We calculate the length of the command after quoting the command twice as
-    when it is executed by the CommandRunner, the command will be quoted twice
-    to ensure the correctness, which will add significant length to the command.
-    """
-
-    quoted_length = len(shlex.quote(shlex.quote(command)))
-    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
-
-
 def _is_message_too_long(returncode: int,
                          output: Optional[str] = None,
                          file_path: Optional[str] = None) -> bool:
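Note: the helper removed above now lives in sky/backends/backend_utils.py as `is_command_length_over_limit` (the updated call sites appear in the hunks below). A minimal self-contained sketch of the check, reusing the 100 KB threshold and the double-quoting logic from the removed lines:

```python
# Sketch of the relocated length check. The constant and quoting logic mirror
# the removed _MAX_INLINE_SCRIPT_LENGTH / _is_command_length_over_limit above.
import shlex

_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # 100 KB, headroom under the 128 KB ARG_MAX


def is_command_length_over_limit(command: str) -> bool:
    # The CommandRunner quotes the command twice before it reaches /bin/sh,
    # so measure the double-quoted form rather than the raw string.
    quoted_length = len(shlex.quote(shlex.quote(command)))
    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH


if __name__ == '__main__':
    print(is_command_length_over_limit('echo hello'))              # False
    print(is_command_length_over_limit('echo ' + 'x' * 110_000))   # True
```

Measuring the double-quoted length matches what actually hits the shell, which is why a 100 KB raw script can still trip the 128 KB kernel limit.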
@@ -294,6 +270,7 @@ def _get_cluster_config_template(cloud):
         clouds.Lambda: 'lambda-ray.yml.j2',
         clouds.IBM: 'ibm-ray.yml.j2',
         clouds.SCP: 'scp-ray.yml.j2',
+        clouds.Slurm: 'slurm-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
         clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
@@ -2516,7 +2493,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     @property
     def is_grpc_enabled_with_flag(self) -> bool:
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
-        return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+        return (env_options.Options.ENABLE_GRPC.get() and
+                self.is_grpc_enabled and
+                not isinstance(self.launched_resources.cloud, clouds.Slurm))
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -3596,6 +3575,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
+
         start = time.time()
 
         if task.setup is None:
@@ -3647,7 +3627,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             _dump_final_script(setup_script,
                                constants.PERSISTENT_SETUP_SCRIPT_PATH)
 
-        if detach_setup or _is_command_length_over_limit(encoded_script):
+        if (detach_setup or
+                backend_utils.is_command_length_over_limit(encoded_script)):
             _dump_final_script(setup_script)
             create_script_code = 'true'
         else:
@@ -3804,7 +3785,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
-        # Should also be ealier than _is_command_length_over_limit
+        # Should also be ealier than is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
@@ -3837,7 +3818,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 tasks=managed_job_tasks,
                 user_id=managed_job_user_id)
 
-            if _is_command_length_over_limit(codegen):
+            if backend_utils.is_command_length_over_limit(codegen):
                 _dump_code_to_file(codegen)
             queue_job_request = jobsv1_pb2.QueueJobRequest(
                 job_id=job_id,
@@ -3859,7 +3840,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             use_legacy = True
 
         if use_legacy:
-            if _is_command_length_over_limit(job_submit_cmd):
+            if backend_utils.is_command_length_over_limit(job_submit_cmd):
                 _dump_code_to_file(codegen)
                 job_submit_cmd = f'{mkdir_code} && {code}'
 
@@ -5850,6 +5831,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return task.envs[constants.USER_ID_ENV_VAR]
         return None
 
+    def _get_task_codegen_class(
+            self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
+        """Returns the appropriate TaskCodeGen for the given handle."""
+        if isinstance(handle.launched_resources.cloud, clouds.Slurm):
+            assert (handle.cached_cluster_info
+                    is not None), ('cached_cluster_info must be set')
+            head_instance = handle.cached_cluster_info.get_head_instance()
+            assert (head_instance is not None), (
+                'Head instance not found in cached cluster info')
+            slurm_job_id = head_instance.tags.get('job_id')
+            assert (slurm_job_id
+                    is not None), ('job_id tag not found in head instance')
+            return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
+        else:
+            return task_codegen.RayCodeGen()
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
                                remote_log_dir: str) -> None:
@@ -5862,15 +5859,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = task_codegen.RayCodeGen()
+        codegen = self._get_task_codegen_class(handle)
+
         codegen.add_prologue(job_id)
         codegen.add_setup(
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
             env_vars=task_env_vars,
+            log_dir=log_dir,
             setup_cmd=self._setup_cmd,
-            setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
         codegen.add_task(
@@ -5907,15 +5905,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = task_codegen.RayCodeGen()
+        codegen = self._get_task_codegen_class(handle)
+
         codegen.add_prologue(job_id)
         codegen.add_setup(
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
             env_vars=task_env_vars,
+            log_dir=log_dir,
             setup_cmd=self._setup_cmd,
-            setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
         codegen.add_task(
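Both `_execute_task_one_node` and `_execute_task_n_nodes` now select their code generator via the new `_get_task_codegen_class`. A standalone sketch of that dispatch pattern; the stub classes and the `get_task_codegen` helper with its `head_tags` argument are illustrative stand-ins, not SkyPilot's real `sky.backends.task_codegen` API:

```python
# Illustrative dispatch mirroring _get_task_codegen_class above: Slurm clusters
# carry the sbatch job id in the head instance's tags; all other clouds keep
# the Ray-based code generator.
class TaskCodeGen: ...

class RayCodeGen(TaskCodeGen): ...

class SlurmCodeGen(TaskCodeGen):
    def __init__(self, slurm_job_id: str):
        self.slurm_job_id = slurm_job_id

def get_task_codegen(cloud_name: str, head_tags: dict) -> TaskCodeGen:
    if cloud_name == 'slurm':
        slurm_job_id = head_tags.get('job_id')
        assert slurm_job_id is not None, 'job_id tag not found'
        return SlurmCodeGen(slurm_job_id=slurm_job_id)
    return RayCodeGen()

print(type(get_task_codegen('slurm', {'job_id': '12345'})).__name__)  # SlurmCodeGen
print(type(get_task_codegen('aws', {})).__name__)                     # RayCodeGen
```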
sky/backends/task_codegen.py
CHANGED

@@ -4,6 +4,7 @@ import copy
 import inspect
 import json
 import math
+import os
 import textwrap
 from typing import Dict, List, Optional, Tuple
 
@@ -181,8 +182,8 @@ class TaskCodeGen:
         resources_dict: Dict[str, float],
         stable_cluster_internal_ips: List[str],
         env_vars: Dict[str, str],
+        log_dir: str,
         setup_cmd: Optional[str] = None,
-        setup_log_path: Optional[str] = None,
     ) -> None:
         """Generates code to set up the task on each node.
 
@@ -379,13 +380,15 @@ class RayCodeGen(TaskCodeGen):
         resources_dict: Dict[str, float],
         stable_cluster_internal_ips: List[str],
         env_vars: Dict[str, str],
+        log_dir: str,
         setup_cmd: Optional[str] = None,
-        setup_log_path: Optional[str] = None,
     ) -> None:
         assert self._has_prologue, ('Call add_prologue() before '
                                     'add_setup().')
         self._has_setup = True
 
+        setup_log_path = os.path.join(log_dir, 'setup.log')
+
         bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
         # Set CPU to avoid ray hanging the resources allocation
         # for remote functions, since the task will request 1 CPU
@@ -631,3 +634,338 @@ class RayCodeGen(TaskCodeGen):
         """Generates code that waits for all tasks, then exits."""
         self._code.append('returncodes, _ = get_or_fail(futures, pg)')
         super().add_epilogue()
+
+
+class SlurmCodeGen(TaskCodeGen):
+    """Code generator for task execution on Slurm using native srun."""
+
+    def __init__(self, slurm_job_id: str):
+        """Initialize SlurmCodeGen
+
+        Args:
+            slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+        """
+        super().__init__()
+        self._slurm_job_id = slurm_job_id
+
+    def add_prologue(self, job_id: int) -> None:
+        assert not self._has_prologue, 'add_prologue() called twice?'
+        self._has_prologue = True
+        self.job_id = job_id
+
+        self._add_common_imports()
+
+        self._code.append(
+            textwrap.dedent("""\
+            import colorama
+            import copy
+            import json
+            import multiprocessing
+            import signal
+            import threading
+            from sky.backends import backend_utils
+            """))
+        self._add_skylet_imports()
+
+        self._add_constants()
+
+        self._add_logging_functions()
+
+        self._code.append(
+            textwrap.dedent(f"""\
+            def _cancel_slurm_job_steps():
+                slurm_job_id = {self._slurm_job_id!r}
+                assert slurm_job_id is not None, 'SLURM_JOB_ID is not set'
+                try:
+                    # Query steps for this job: squeue -s -j JOBID -h -o "%i %j"
+                    # Output format: "JOBID.STEPID STEPNAME"
+                    # TODO(kevin): This assumes that compute node is able
+                    # to run client commands against the controller.
+                    # Validate this assumption.
+                    result = subprocess.run(
+                        ['squeue', '-s', '-j', slurm_job_id, '-h', '-o', '%i %j'],
+                        capture_output=True, text=True, check=False)
+                    for line in result.stdout.strip().split('\\n'):
+                        if not line:
+                            continue
+                        parts = line.split()
+                        assert len(parts) >= 2, 'Expected at least 2 parts'
+                        step_id, step_name = parts[0], parts[1]
+                        if step_name == f'sky-{self.job_id}':
+                            subprocess.run(['scancel', step_id],
+                                           check=False, capture_output=True)
+                except Exception as e:
+                    print(f'Error in _cancel_slurm_job_steps: {{e}}', flush=True)
+                    pass
+
+            def _slurm_cleanup_handler(signum, _frame):
+                _cancel_slurm_job_steps()
+                # Re-raise to let default handler terminate.
+                signal.signal(signum, signal.SIG_DFL)
+                os.kill(os.getpid(), signum)
+
+            signal.signal(signal.SIGTERM, _slurm_cleanup_handler)
+            """))
+
+        self._code += [
+            'autostop_lib.set_last_active_time_to_now()',
+            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+        ]
+
+        self._setup_cmd: Optional[str] = None
+        self._setup_envs: Optional[Dict[str, str]] = None
+        self._setup_log_dir: Optional[str] = None
+        self._setup_num_nodes: Optional[int] = None
+
+    def add_setup(
+        self,
+        num_nodes: int,
+        resources_dict: Dict[str, float],
+        stable_cluster_internal_ips: List[str],
+        env_vars: Dict[str, str],
+        log_dir: str,
+        setup_cmd: Optional[str] = None,
+    ) -> None:
+        assert self._has_prologue, ('Call add_prologue() before add_setup().')
+        self._has_setup = True
+        self._cluster_num_nodes = len(stable_cluster_internal_ips)
+        self._stable_cluster_ips = stable_cluster_internal_ips
+
+        self._add_waiting_for_resources_msg(num_nodes)
+
+        # Store setup information for use in add_task().
+        if setup_cmd is not None:
+            setup_envs = env_vars.copy()
+            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+            self._setup_cmd = setup_cmd
+            self._setup_envs = setup_envs
+            self._setup_log_dir = log_dir
+            self._setup_num_nodes = num_nodes
+
+    def add_task(
+        self,
+        num_nodes: int,
+        bash_script: Optional[str],
+        task_name: Optional[str],
+        resources_dict: Dict[str, float],
+        log_dir: str,
+        env_vars: Optional[Dict[str, str]] = None,
+    ) -> None:
+        """Generates code for invoking a bash command
+        using srun within sbatch allocation.
+        """
+        assert self._has_setup, 'Call add_setup() before add_task().'
+        env_vars = env_vars or {}
+        task_name = task_name if task_name is not None else 'task'
+
+        acc_name, acc_count = self._get_accelerator_details(resources_dict)
+        num_gpus = 0
+        if (acc_name is not None and
+                not accelerator_registry.is_schedulable_non_gpu_accelerator(
+                    acc_name)):
+            num_gpus = int(math.ceil(acc_count))
+
+        # Slurm does not support fractional CPUs.
+        task_cpu_demand = int(math.ceil(resources_dict.pop('CPU')))
+
+        sky_env_vars_dict_str = [
+            textwrap.dedent(f"""\
+            sky_env_vars_dict = {{}}
+            sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+            """)
+        ]
+
+        if env_vars:
+            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+                                         for k, v in env_vars.items())
+        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+        rclone_flush_script = self._get_rclone_flush_script()
+        streaming_msg = self._get_job_started_msg()
+        has_setup_cmd = self._setup_cmd is not None
+
+        self._code += [
+            sky_env_vars_dict_str,
+            textwrap.dedent(f"""\
+            script = {bash_script!r}
+            if script is None:
+                script = ''
+            rclone_flush_script = {rclone_flush_script!r}
+
+            if script or {has_setup_cmd!r}:
+                script += rclone_flush_script
+                sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {num_gpus}
+
+                # Signal files for setup/run synchronization:
+                # 1. alloc_signal_file: srun has acquired allocation
+                # 2. setup_done_signal_file: Driver has finished setup, run can proceed
+                #
+                # Signal files are stored in home directory, which is
+                # assumed to be on a shared NFS mount accessible by all nodes.
+                # To support clusters with non-NFS home directories, we would
+                # need to let users specify an NFS-backed "working directory"
+                # or use a different coordination mechanism.
+                alloc_signal_file = f'~/.sky_alloc_{self._slurm_job_id}_{self.job_id}'
+                alloc_signal_file = os.path.expanduser(alloc_signal_file)
+                setup_done_signal_file = f'~/.sky_setup_done_{self._slurm_job_id}_{self.job_id}'
+                setup_done_signal_file = os.path.expanduser(setup_done_signal_file)
+
+                # Start exclusive srun in a thread to reserve allocation (similar to ray.get(pg.ready()))
+                gpu_arg = f'--gpus-per-node={num_gpus}' if {num_gpus} > 0 else ''
+
+                def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
+                                          task_name=None, is_setup=False,
+                                          alloc_signal=None, setup_done_signal=None):
+                    env_vars_json = json.dumps(env_vars_dict)
+
+                    log_dir = shlex.quote(log_dir)
+                    env_vars = shlex.quote(env_vars_json)
+                    cluster_ips = shlex.quote(",".join({self._stable_cluster_ips!r}))
+
+                    runner_args = f'--log-dir={{log_dir}} --env-vars={{env_vars}} --cluster-num-nodes={self._cluster_num_nodes} --cluster-ips={{cluster_ips}}'
+
+                    if task_name is not None:
+                        runner_args += f' --task-name={{shlex.quote(task_name)}}'
+
+                    if is_setup:
+                        runner_args += ' --is-setup'
+
+                    if alloc_signal is not None:
+                        runner_args += f' --alloc-signal-file={{shlex.quote(alloc_signal)}}'
+
+                    if setup_done_signal is not None:
+                        runner_args += f' --setup-done-signal-file={{shlex.quote(setup_done_signal)}}'
+
+                    script_path = None
+                    prefix = 'sky_setup_' if is_setup else 'sky_task_'
+                    if backend_utils.is_command_length_over_limit(user_script):
+                        with tempfile.NamedTemporaryFile('w', prefix=prefix, suffix='.sh', delete=False) as f:
+                            f.write(user_script)
+                            script_path = f.name
+                        runner_args += f' --script-path={{shlex.quote(script_path)}}'
+                    else:
+                        runner_args += f' --script={{shlex.quote(user_script)}}'
+
+                    # Use /usr/bin/env explicitly to work around a Slurm quirk where
+                    # srun's execvp() doesn't check execute permissions, failing when
+                    # $HOME/.local/bin/env (non-executable, from uv installation)
+                    # shadows /usr/bin/env.
+                    job_suffix = '-setup' if is_setup else ''
+                    srun_cmd = (
+                        f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
+                        f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
+                        f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+                    )
+                    return srun_cmd, script_path
+
+                def run_thread_func():
+                    # This blocks until Slurm allocates resources (--exclusive)
+                    # --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
+                    run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
+                    srun_cmd, task_script_path = build_task_runner_cmd(
+                        script, run_flags, {log_dir!r}, sky_env_vars_dict,
+                        task_name={task_name!r},
+                        alloc_signal=alloc_signal_file,
+                        setup_done_signal=setup_done_signal_file
+                    )
+
+                    proc = subprocess.Popen(srun_cmd, shell=True,
+                                            stdout=subprocess.PIPE,
+                                            stderr=subprocess.STDOUT,
+                                            text=True)
+                    for line in proc.stdout:
+                        print(line, end='', flush=True)
+                    proc.wait()
+
+                    if task_script_path is not None:
+                        os.remove(task_script_path)
+                    return {{'return_code': proc.returncode, 'pid': proc.pid}}
+
+                run_thread_result = {{'result': None}}
+                def run_thread_wrapper():
+                    run_thread_result['result'] = run_thread_func()
+
+                run_thread = threading.Thread(target=run_thread_wrapper)
+                run_thread.start()
+
+                # Wait for allocation signal from inside srun
+                while not os.path.exists(alloc_signal_file):
+                    if not run_thread.is_alive():
+                        # srun failed before creating the signal file.
+                        run_thread.join()
+                        result = run_thread_result['result']
+                        returncode = int(result.get('return_code', 1))
+                        pid = result.get('pid', os.getpid())
+                        msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{returncode}} (pid={{pid}}).'
+                        msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+                        print(msg, flush=True)
+                        returncodes = [returncode]
+                        job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+                        sys.exit(1)
+                    time.sleep(0.1)
+
+                print({streaming_msg!r}, flush=True)
+
+                if {has_setup_cmd!r}:
+                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+                    # The schedule_step should be called after the job status is set to
+                    # non-PENDING, otherwise, the scheduler will think the current job
+                    # is not submitted yet, and skip the scheduling step.
+                    job_lib.scheduler.schedule_step()
+
+                    # --overlap as we have already secured allocation with the srun for the run section,
+                    # and otherwise this srun would get blocked and deadlock.
+                    setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
+                    setup_srun, setup_script_path = build_task_runner_cmd(
+                        {self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
+                        is_setup=True
+                    )
+
+                    # Run setup srun directly, streaming output to driver stdout
+                    setup_proc = subprocess.Popen(setup_srun, shell=True,
+                                                  stdout=subprocess.PIPE,
+                                                  stderr=subprocess.STDOUT,
+                                                  text=True)
+                    for line in setup_proc.stdout:
+                        print(line, end='', flush=True)
+                    setup_proc.wait()
+
+                    if setup_script_path is not None:
+                        os.remove(setup_script_path)
+
+                    setup_returncode = setup_proc.returncode
+                    if setup_returncode != 0:
+                        setup_pid = setup_proc.pid
+                        msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{setup_returncode}} (pid={{setup_pid}}).'
+                        msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+                        print(msg, flush=True)
+                        job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+                        # Cancel the srun spawned by run_thread_func.
+                        _cancel_slurm_job_steps()
+                        sys.exit(1)
+
+                job_lib.set_job_started({self.job_id!r})
+                if not {has_setup_cmd!r}:
+                    # Need to call schedule_step() to make sure the scheduler
+                    # schedule the next pending job.
+                    job_lib.scheduler.schedule_step()
+
+                # Signal run thread to proceed.
+                pathlib.Path(setup_done_signal_file).touch()
+
+                # Wait for run thread to complete.
+                run_thread.join()
+                result = run_thread_result['result']
+
+                # Cleanup signal files
+                if os.path.exists(alloc_signal_file):
+                    os.remove(alloc_signal_file)
+                if os.path.exists(setup_done_signal_file):
+                    os.remove(setup_done_signal_file)
+
+                returncodes = [int(result.get('return_code', 1))]
+            else:
+                returncodes = [0]
+            """),
+        ]
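The generated driver in `SlurmCodeGen.add_task` coordinates two steps: the run srun (which holds the node allocation and touches `alloc_signal_file`) and the setup srun (run with `--overlap`, after which the driver touches `setup_done_signal_file` to unblock the run step). A toy reproduction of that two-file handshake, with sleeps standing in for the srun subprocesses:

```python
# Minimal sketch of the signal-file handshake used by the generated driver
# above. Real code runs srun subprocesses; here a thread plays the run step.
import os
import pathlib
import tempfile
import threading
import time

tmp = tempfile.mkdtemp()
alloc_signal = os.path.join(tmp, 'alloc')
setup_done_signal = os.path.join(tmp, 'setup_done')

def run_step():
    pathlib.Path(alloc_signal).touch()          # allocation secured
    while not os.path.exists(setup_done_signal):
        time.sleep(0.05)                        # block until setup finishes
    print('run: executing task')

t = threading.Thread(target=run_step)
t.start()
while not os.path.exists(alloc_signal):
    time.sleep(0.05)                            # driver waits for allocation
print('driver: allocation held, running setup')
pathlib.Path(setup_done_signal).touch()         # setup done, unblock run
t.join()
```

As the code comments note, this relies on the signal files living on a home directory shared across nodes (e.g. NFS).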
sky/catalog/__init__.py
CHANGED

@@ -127,12 +127,9 @@ def list_accelerator_realtime(
     case_sensitive: bool = True,
 ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
     """Lists all accelerators offered by Sky with their realtime availability.
-
     Realtime availability is the total number of accelerators in the cluster
     and number of accelerators available at the time of the call.
-
     Used for fixed size cluster settings, such as Kubernetes.
-
     Returns:
         A tuple of three dictionaries mapping canonical accelerator names to:
         - A list of available counts. (e.g., [1, 2, 4])
sky/catalog/kubernetes_catalog.py
CHANGED

@@ -204,6 +204,9 @@ def _list_accelerators(
     min_quantity_filter = quantity_filter if quantity_filter else 1
 
     for node in nodes:
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
         for key in keys:
             if key in node.metadata.labels:
                 accelerator_name = lf.get_accelerator_from_label_value(
@@ -260,6 +263,15 @@ def _list_accelerators(
                     total_accelerators_capacity[
                         accelerator_name] += quantized_count
 
+                    # Initialize the total_accelerators_available to make sure the
+                    # key exists in the dictionary.
+                    total_accelerators_available[accelerator_name] = (
+                        total_accelerators_available.get(accelerator_name, 0))
+
+                    # Skip availability counting for not-ready nodes
+                    if not node_is_ready:
+                        continue
+
                     if error_on_get_allocated_gpu_qty_by_node:
                         # If we can't get the allocated GPU quantity by each node,
                         # we can't get the GPU usage.
@@ -268,10 +280,6 @@ def _list_accelerators(
 
                     allocated_qty = allocated_qty_by_node[node.metadata.name]
                     accelerators_available = accelerator_count - allocated_qty
-                    # Initialize the total_accelerators_available to make sure the
-                    # key exists in the dictionary.
-                    total_accelerators_available[accelerator_name] = (
-                        total_accelerators_available.get(accelerator_name, 0))
 
                     if accelerators_available >= min_quantity_filter:
                         quantized_availability = min_quantity_filter * (