skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -192,18 +192,6 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
  pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
  'monkey_patches' / 'monkey_patch_ray_up.py')
 
- # The maximum size of a command line arguments is 128 KB, i.e. the command
- # executed with /bin/sh should be less than 128KB.
- # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
- #
- # If a user have very long run or setup commands, the generated command may
- # exceed the limit, as we directly include scripts in job submission commands.
- # If the command is too long, we instead write it to a file, rsync and execute
- # it.
- #
- # We use 100KB as a threshold to be safe for other arguments that
- # might be added during ssh.
- _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
  _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
  ('too long', 255),
  ('request-uri too large', 1),
@@ -218,18 +206,6 @@ _RESOURCES_UNAVAILABLE_LOG = (
  _CLUSTER_LOCK_TIMEOUT = 5.0
 
 
- def _is_command_length_over_limit(command: str) -> bool:
- """Check if the length of the command exceeds the limit.
-
- We calculate the length of the command after quoting the command twice as
- when it is executed by the CommandRunner, the command will be quoted twice
- to ensure the correctness, which will add significant length to the command.
- """
-
- quoted_length = len(shlex.quote(shlex.quote(command)))
- return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
-
-
  def _is_message_too_long(returncode: int,
  output: Optional[str] = None,
  file_path: Optional[str] = None) -> bool:
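
The removed threshold and helper are not dropped from the package: later hunks in this diff call backend_utils.is_command_length_over_limit, and sky/backends/backend_utils.py grows by 45 lines, so the logic has presumably been relocated there. A minimal sketch of the relocated helper, reconstructed from the removed body above (its exact placement and final form in backend_utils.py are assumptions, not shown in this diff):

    import shlex

    # Presumed new home: sky/backends/backend_utils.py (an assumption).
    # Commands longer than this (after double shell-quoting) are written to a
    # file, rsynced, and executed remotely instead of being passed inline.
    _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # 100 KB, under the ~128 KB limit.


    def is_command_length_over_limit(command: str) -> bool:
        """Check whether the command, quoted twice, exceeds the inline limit."""
        # The CommandRunner quotes the command twice before execution, which
        # adds significant length, so measure the double-quoted form.
        quoted_length = len(shlex.quote(shlex.quote(command)))
        return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
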
@@ -294,6 +270,7 @@ def _get_cluster_config_template(cloud):
  clouds.Lambda: 'lambda-ray.yml.j2',
  clouds.IBM: 'ibm-ray.yml.j2',
  clouds.SCP: 'scp-ray.yml.j2',
+ clouds.Slurm: 'slurm-ray.yml.j2',
  clouds.OCI: 'oci-ray.yml.j2',
  clouds.Paperspace: 'paperspace-ray.yml.j2',
  clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
@@ -2516,7 +2493,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  @property
  def is_grpc_enabled_with_flag(self) -> bool:
  """Returns whether this handle has gRPC enabled and gRPC flag is set."""
- return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+ return (env_options.Options.ENABLE_GRPC.get() and
+ self.is_grpc_enabled and
+ not isinstance(self.launched_resources.cloud, clouds.Slurm))
 
  def __getstate__(self):
  state = self.__dict__.copy()
@@ -3596,6 +3575,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
  detach_setup: bool) -> None:
+
  start = time.time()
 
  if task.setup is None:
@@ -3647,7 +3627,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  _dump_final_script(setup_script,
  constants.PERSISTENT_SETUP_SCRIPT_PATH)
 
- if detach_setup or _is_command_length_over_limit(encoded_script):
+ if (detach_setup or
+ backend_utils.is_command_length_over_limit(encoded_script)):
  _dump_final_script(setup_script)
  create_script_code = 'true'
  else:
@@ -3804,7 +3785,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
- # Should also be ealier than _is_command_length_over_limit
+ # Should also be ealier than is_command_length_over_limit
  # Same reason as in _setup
  if self._dump_final_script:
  _dump_code_to_file(job_submit_cmd,
@@ -3837,7 +3818,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  tasks=managed_job_tasks,
  user_id=managed_job_user_id)
 
- if _is_command_length_over_limit(codegen):
+ if backend_utils.is_command_length_over_limit(codegen):
  _dump_code_to_file(codegen)
  queue_job_request = jobsv1_pb2.QueueJobRequest(
  job_id=job_id,
@@ -3859,7 +3840,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  use_legacy = True
 
  if use_legacy:
- if _is_command_length_over_limit(job_submit_cmd):
+ if backend_utils.is_command_length_over_limit(job_submit_cmd):
  _dump_code_to_file(codegen)
  job_submit_cmd = f'{mkdir_code} && {code}'
 
@@ -5850,6 +5831,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  return task.envs[constants.USER_ID_ENV_VAR]
  return None
 
+ def _get_task_codegen_class(
+ self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
+ """Returns the appropriate TaskCodeGen for the given handle."""
+ if isinstance(handle.launched_resources.cloud, clouds.Slurm):
+ assert (handle.cached_cluster_info
+ is not None), ('cached_cluster_info must be set')
+ head_instance = handle.cached_cluster_info.get_head_instance()
+ assert (head_instance is not None), (
+ 'Head instance not found in cached cluster info')
+ slurm_job_id = head_instance.tags.get('job_id')
+ assert (slurm_job_id
+ is not None), ('job_id tag not found in head instance')
+ return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
+ else:
+ return task_codegen.RayCodeGen()
+
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
  remote_log_dir: str) -> None:
@@ -5862,15 +5859,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
- codegen = task_codegen.RayCodeGen()
+ codegen = self._get_task_codegen_class(handle)
+
  codegen.add_prologue(job_id)
  codegen.add_setup(
  1,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
  env_vars=task_env_vars,
+ log_dir=log_dir,
  setup_cmd=self._setup_cmd,
- setup_log_path=os.path.join(log_dir, 'setup.log'),
  )
 
  codegen.add_task(
@@ -5907,15 +5905,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
- codegen = task_codegen.RayCodeGen()
+ codegen = self._get_task_codegen_class(handle)
+
  codegen.add_prologue(job_id)
  codegen.add_setup(
  num_actual_nodes,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
  env_vars=task_env_vars,
+ log_dir=log_dir,
  setup_cmd=self._setup_cmd,
- setup_log_path=os.path.join(log_dir, 'setup.log'),
  )
 
  codegen.add_task(
sky/backends/task_codegen.py CHANGED
@@ -4,6 +4,7 @@ import copy
  import inspect
  import json
  import math
+ import os
  import textwrap
  from typing import Dict, List, Optional, Tuple
 
@@ -181,8 +182,8 @@ class TaskCodeGen:
  resources_dict: Dict[str, float],
  stable_cluster_internal_ips: List[str],
  env_vars: Dict[str, str],
+ log_dir: str,
  setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
  ) -> None:
  """Generates code to set up the task on each node.
 
@@ -379,13 +380,15 @@ class RayCodeGen(TaskCodeGen):
  resources_dict: Dict[str, float],
  stable_cluster_internal_ips: List[str],
  env_vars: Dict[str, str],
+ log_dir: str,
  setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
  ) -> None:
  assert self._has_prologue, ('Call add_prologue() before '
  'add_setup().')
  self._has_setup = True
 
+ setup_log_path = os.path.join(log_dir, 'setup.log')
+
  bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
  # Set CPU to avoid ray hanging the resources allocation
  # for remote functions, since the task will request 1 CPU
@@ -631,3 +634,338 @@ class RayCodeGen(TaskCodeGen):
  """Generates code that waits for all tasks, then exits."""
  self._code.append('returncodes, _ = get_or_fail(futures, pg)')
  super().add_epilogue()
+
+
+ class SlurmCodeGen(TaskCodeGen):
+ """Code generator for task execution on Slurm using native srun."""
+
+ def __init__(self, slurm_job_id: str):
+ """Initialize SlurmCodeGen
+
+ Args:
+ slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+ """
+ super().__init__()
+ self._slurm_job_id = slurm_job_id
+
+ def add_prologue(self, job_id: int) -> None:
+ assert not self._has_prologue, 'add_prologue() called twice?'
+ self._has_prologue = True
+ self.job_id = job_id
+
+ self._add_common_imports()
+
+ self._code.append(
+ textwrap.dedent("""\
+ import colorama
+ import copy
+ import json
+ import multiprocessing
+ import signal
+ import threading
+ from sky.backends import backend_utils
+ """))
+ self._add_skylet_imports()
+
+ self._add_constants()
+
+ self._add_logging_functions()
+
+ self._code.append(
+ textwrap.dedent(f"""\
+ def _cancel_slurm_job_steps():
+ slurm_job_id = {self._slurm_job_id!r}
+ assert slurm_job_id is not None, 'SLURM_JOB_ID is not set'
+ try:
+ # Query steps for this job: squeue -s -j JOBID -h -o "%i %j"
+ # Output format: "JOBID.STEPID STEPNAME"
+ # TODO(kevin): This assumes that compute node is able
+ # to run client commands against the controller.
+ # Validate this assumption.
+ result = subprocess.run(
+ ['squeue', '-s', '-j', slurm_job_id, '-h', '-o', '%i %j'],
+ capture_output=True, text=True, check=False)
+ for line in result.stdout.strip().split('\\n'):
+ if not line:
+ continue
+ parts = line.split()
+ assert len(parts) >= 2, 'Expected at least 2 parts'
+ step_id, step_name = parts[0], parts[1]
+ if step_name == f'sky-{self.job_id}':
+ subprocess.run(['scancel', step_id],
+ check=False, capture_output=True)
+ except Exception as e:
+ print(f'Error in _cancel_slurm_job_steps: {{e}}', flush=True)
+ pass
+
+ def _slurm_cleanup_handler(signum, _frame):
+ _cancel_slurm_job_steps()
+ # Re-raise to let default handler terminate.
+ signal.signal(signum, signal.SIG_DFL)
+ os.kill(os.getpid(), signum)
+
+ signal.signal(signal.SIGTERM, _slurm_cleanup_handler)
+ """))
+
+ self._code += [
+ 'autostop_lib.set_last_active_time_to_now()',
+ f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+ ]
+
+ self._setup_cmd: Optional[str] = None
+ self._setup_envs: Optional[Dict[str, str]] = None
+ self._setup_log_dir: Optional[str] = None
+ self._setup_num_nodes: Optional[int] = None
+
+ def add_setup(
+ self,
+ num_nodes: int,
+ resources_dict: Dict[str, float],
+ stable_cluster_internal_ips: List[str],
+ env_vars: Dict[str, str],
+ log_dir: str,
+ setup_cmd: Optional[str] = None,
+ ) -> None:
+ assert self._has_prologue, ('Call add_prologue() before add_setup().')
+ self._has_setup = True
+ self._cluster_num_nodes = len(stable_cluster_internal_ips)
+ self._stable_cluster_ips = stable_cluster_internal_ips
+
+ self._add_waiting_for_resources_msg(num_nodes)
+
+ # Store setup information for use in add_task().
+ if setup_cmd is not None:
+ setup_envs = env_vars.copy()
+ setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+ self._setup_cmd = setup_cmd
+ self._setup_envs = setup_envs
+ self._setup_log_dir = log_dir
+ self._setup_num_nodes = num_nodes
+
+ def add_task(
+ self,
+ num_nodes: int,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None,
+ ) -> None:
+ """Generates code for invoking a bash command
+ using srun within sbatch allocation.
+ """
+ assert self._has_setup, 'Call add_setup() before add_task().'
+ env_vars = env_vars or {}
+ task_name = task_name if task_name is not None else 'task'
+
+ acc_name, acc_count = self._get_accelerator_details(resources_dict)
+ num_gpus = 0
+ if (acc_name is not None and
+ not accelerator_registry.is_schedulable_non_gpu_accelerator(
+ acc_name)):
+ num_gpus = int(math.ceil(acc_count))
+
+ # Slurm does not support fractional CPUs.
+ task_cpu_demand = int(math.ceil(resources_dict.pop('CPU')))
+
+ sky_env_vars_dict_str = [
+ textwrap.dedent(f"""\
+ sky_env_vars_dict = {{}}
+ sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+ """)
+ ]
+
+ if env_vars:
+ sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+ for k, v in env_vars.items())
+ sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+ rclone_flush_script = self._get_rclone_flush_script()
+ streaming_msg = self._get_job_started_msg()
+ has_setup_cmd = self._setup_cmd is not None
+
+ self._code += [
+ sky_env_vars_dict_str,
+ textwrap.dedent(f"""\
+ script = {bash_script!r}
+ if script is None:
+ script = ''
+ rclone_flush_script = {rclone_flush_script!r}
+
+ if script or {has_setup_cmd!r}:
+ script += rclone_flush_script
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {num_gpus}
+
+ # Signal files for setup/run synchronization:
+ # 1. alloc_signal_file: srun has acquired allocation
+ # 2. setup_done_signal_file: Driver has finished setup, run can proceed
+ #
+ # Signal files are stored in home directory, which is
+ # assumed to be on a shared NFS mount accessible by all nodes.
+ # To support clusters with non-NFS home directories, we would
+ # need to let users specify an NFS-backed "working directory"
+ # or use a different coordination mechanism.
+ alloc_signal_file = f'~/.sky_alloc_{self._slurm_job_id}_{self.job_id}'
+ alloc_signal_file = os.path.expanduser(alloc_signal_file)
+ setup_done_signal_file = f'~/.sky_setup_done_{self._slurm_job_id}_{self.job_id}'
+ setup_done_signal_file = os.path.expanduser(setup_done_signal_file)
+
+ # Start exclusive srun in a thread to reserve allocation (similar to ray.get(pg.ready()))
+ gpu_arg = f'--gpus-per-node={num_gpus}' if {num_gpus} > 0 else ''
+
+ def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
+ task_name=None, is_setup=False,
+ alloc_signal=None, setup_done_signal=None):
+ env_vars_json = json.dumps(env_vars_dict)
+
+ log_dir = shlex.quote(log_dir)
+ env_vars = shlex.quote(env_vars_json)
+ cluster_ips = shlex.quote(",".join({self._stable_cluster_ips!r}))
+
+ runner_args = f'--log-dir={{log_dir}} --env-vars={{env_vars}} --cluster-num-nodes={self._cluster_num_nodes} --cluster-ips={{cluster_ips}}'
+
+ if task_name is not None:
+ runner_args += f' --task-name={{shlex.quote(task_name)}}'
+
+ if is_setup:
+ runner_args += ' --is-setup'
+
+ if alloc_signal is not None:
+ runner_args += f' --alloc-signal-file={{shlex.quote(alloc_signal)}}'
+
+ if setup_done_signal is not None:
+ runner_args += f' --setup-done-signal-file={{shlex.quote(setup_done_signal)}}'
+
+ script_path = None
+ prefix = 'sky_setup_' if is_setup else 'sky_task_'
+ if backend_utils.is_command_length_over_limit(user_script):
+ with tempfile.NamedTemporaryFile('w', prefix=prefix, suffix='.sh', delete=False) as f:
+ f.write(user_script)
+ script_path = f.name
+ runner_args += f' --script-path={{shlex.quote(script_path)}}'
+ else:
+ runner_args += f' --script={{shlex.quote(user_script)}}'
+
+ # Use /usr/bin/env explicitly to work around a Slurm quirk where
+ # srun's execvp() doesn't check execute permissions, failing when
+ # $HOME/.local/bin/env (non-executable, from uv installation)
+ # shadows /usr/bin/env.
+ job_suffix = '-setup' if is_setup else ''
+ srun_cmd = (
+ f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
+ f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
+ f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+ )
+ return srun_cmd, script_path
+
+ def run_thread_func():
+ # This blocks until Slurm allocates resources (--exclusive)
+ # --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
+ run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
+ srun_cmd, task_script_path = build_task_runner_cmd(
+ script, run_flags, {log_dir!r}, sky_env_vars_dict,
+ task_name={task_name!r},
+ alloc_signal=alloc_signal_file,
+ setup_done_signal=setup_done_signal_file
+ )
+
+ proc = subprocess.Popen(srun_cmd, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in proc.stdout:
+ print(line, end='', flush=True)
+ proc.wait()
+
+ if task_script_path is not None:
+ os.remove(task_script_path)
+ return {{'return_code': proc.returncode, 'pid': proc.pid}}
+
+ run_thread_result = {{'result': None}}
+ def run_thread_wrapper():
+ run_thread_result['result'] = run_thread_func()
+
+ run_thread = threading.Thread(target=run_thread_wrapper)
+ run_thread.start()
+
+ # Wait for allocation signal from inside srun
+ while not os.path.exists(alloc_signal_file):
+ if not run_thread.is_alive():
+ # srun failed before creating the signal file.
+ run_thread.join()
+ result = run_thread_result['result']
+ returncode = int(result.get('return_code', 1))
+ pid = result.get('pid', os.getpid())
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{returncode}} (pid={{pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ returncodes = [returncode]
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ sys.exit(1)
+ time.sleep(0.1)
+
+ print({streaming_msg!r}, flush=True)
+
+ if {has_setup_cmd!r}:
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+ # The schedule_step should be called after the job status is set to
+ # non-PENDING, otherwise, the scheduler will think the current job
+ # is not submitted yet, and skip the scheduling step.
+ job_lib.scheduler.schedule_step()
+
+ # --overlap as we have already secured allocation with the srun for the run section,
+ # and otherwise this srun would get blocked and deadlock.
+ setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
+ setup_srun, setup_script_path = build_task_runner_cmd(
+ {self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
+ is_setup=True
+ )
+
+ # Run setup srun directly, streaming output to driver stdout
+ setup_proc = subprocess.Popen(setup_srun, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in setup_proc.stdout:
+ print(line, end='', flush=True)
+ setup_proc.wait()
+
+ if setup_script_path is not None:
+ os.remove(setup_script_path)
+
+ setup_returncode = setup_proc.returncode
+ if setup_returncode != 0:
+ setup_pid = setup_proc.pid
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{setup_returncode}} (pid={{setup_pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ # Cancel the srun spawned by run_thread_func.
+ _cancel_slurm_job_steps()
+ sys.exit(1)
+
+ job_lib.set_job_started({self.job_id!r})
+ if not {has_setup_cmd!r}:
+ # Need to call schedule_step() to make sure the scheduler
+ # schedule the next pending job.
+ job_lib.scheduler.schedule_step()
+
+ # Signal run thread to proceed.
+ pathlib.Path(setup_done_signal_file).touch()
+
+ # Wait for run thread to complete.
+ run_thread.join()
+ result = run_thread_result['result']
+
+ # Cleanup signal files
+ if os.path.exists(alloc_signal_file):
+ os.remove(alloc_signal_file)
+ if os.path.exists(setup_done_signal_file):
+ os.remove(setup_done_signal_file)
+
+ returncodes = [int(result.get('return_code', 1))]
+ else:
+ returncodes = [0]
+ """),
+ ]
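
The driver script generated above starts the run-section srun first, in a background thread with --exclusive, so that it is the step that reserves the allocation; two files then coordinate it with setup: the executor inside srun touches alloc_signal_file once it holds the allocation, the driver runs setup with --overlap, and touching setup_done_signal_file releases the run step. A minimal standalone sketch of that handshake, with illustrative paths and echo commands standing in for the real srun invocations:

    import os
    import pathlib
    import subprocess
    import tempfile
    import threading
    import time

    # Illustrative stand-ins; the real driver builds srun commands via
    # build_task_runner_cmd() and puts the signal files in the home directory.
    signal_dir = tempfile.mkdtemp()
    alloc_signal = os.path.join(signal_dir, 'alloc')
    setup_done_signal = os.path.join(signal_dir, 'setup_done')

    def run_step():
        # Stand-in for the exclusive srun: signal that the allocation is held,
        # then wait for the driver to finish setup before running the task.
        pathlib.Path(alloc_signal).touch()
        while not os.path.exists(setup_done_signal):
            time.sleep(0.1)
        return subprocess.run(['echo', 'run task here'], check=False).returncode

    result = {}
    run_thread = threading.Thread(target=lambda: result.update(rc=run_step()))
    run_thread.start()

    # Driver side: wait until the run step holds the allocation...
    while not os.path.exists(alloc_signal):
        time.sleep(0.1)
    # ...run setup while the allocation is held, then unblock the run step.
    subprocess.run(['echo', 'setup here'], check=False)
    pathlib.Path(setup_done_signal).touch()
    run_thread.join()
    print('run step exited with', result.get('rc'))
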
sky/catalog/__init__.py CHANGED
@@ -127,12 +127,9 @@ def list_accelerator_realtime(
  case_sensitive: bool = True,
  ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
  """Lists all accelerators offered by Sky with their realtime availability.
-
  Realtime availability is the total number of accelerators in the cluster
  and number of accelerators available at the time of the call.
-
  Used for fixed size cluster settings, such as Kubernetes.
-
  Returns:
  A tuple of three dictionaries mapping canonical accelerator names to:
  - A list of available counts. (e.g., [1, 2, 4])
sky/catalog/kubernetes_catalog.py CHANGED
@@ -204,6 +204,9 @@ def _list_accelerators(
  min_quantity_filter = quantity_filter if quantity_filter else 1
 
  for node in nodes:
+ # Check if node is ready
+ node_is_ready = node.is_ready()
+
  for key in keys:
  if key in node.metadata.labels:
  accelerator_name = lf.get_accelerator_from_label_value(
@@ -260,6 +263,15 @@ def _list_accelerators(
  total_accelerators_capacity[
  accelerator_name] += quantized_count
 
+ # Initialize the total_accelerators_available to make sure the
+ # key exists in the dictionary.
+ total_accelerators_available[accelerator_name] = (
+ total_accelerators_available.get(accelerator_name, 0))
+
+ # Skip availability counting for not-ready nodes
+ if not node_is_ready:
+ continue
+
  if error_on_get_allocated_gpu_qty_by_node:
  # If we can't get the allocated GPU quantity by each node,
  # we can't get the GPU usage.
@@ -268,10 +280,6 @@ def _list_accelerators(
 
  allocated_qty = allocated_qty_by_node[node.metadata.name]
  accelerators_available = accelerator_count - allocated_qty
- # Initialize the total_accelerators_available to make sure the
- # key exists in the dictionary.
- total_accelerators_available[accelerator_name] = (
- total_accelerators_available.get(accelerator_name, 0))
 
  if accelerators_available >= min_quantity_filter:
  quantized_availability = min_quantity_filter * (
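
Taken together, the kubernetes_catalog.py hunks above count capacity for every node, always initialize the availability entry, and let only ready nodes contribute to the available count, so an accelerator that exists only on not-ready nodes shows up with zero availability instead of being missing. A simplified sketch of that counting pattern, using hypothetical node dicts rather than the real Kubernetes node objects:

    # Hypothetical nodes; only the fields needed for the counting pattern.
    nodes = [
        {'name': 'gpu-node-1', 'ready': True, 'acc': 'H100', 'count': 8, 'allocated': 3},
        {'name': 'gpu-node-2', 'ready': False, 'acc': 'H100', 'count': 8, 'allocated': 0},
    ]

    total_capacity = {}
    total_available = {}
    for node in nodes:
        acc = node['acc']
        total_capacity[acc] = total_capacity.get(acc, 0) + node['count']
        # Ensure the key exists even if every node with this accelerator is
        # not ready, so callers see 0 availability instead of a missing entry.
        total_available[acc] = total_available.get(acc, 0)
        if not node['ready']:
            continue  # Capacity counted above; availability skipped.
        total_available[acc] += node['count'] - node['allocated']

    print(total_capacity)   # {'H100': 16}
    print(total_available)  # {'H100': 5}
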