skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -192,18 +192,6 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
  pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
  'monkey_patches' / 'monkey_patch_ray_up.py')
 
- # The maximum size of a command line arguments is 128 KB, i.e. the command
- # executed with /bin/sh should be less than 128KB.
- # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
- #
- # If a user have very long run or setup commands, the generated command may
- # exceed the limit, as we directly include scripts in job submission commands.
- # If the command is too long, we instead write it to a file, rsync and execute
- # it.
- #
- # We use 100KB as a threshold to be safe for other arguments that
- # might be added during ssh.
- _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
  _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
  ('too long', 255),
  ('request-uri too large', 1),
@@ -218,18 +206,6 @@ _RESOURCES_UNAVAILABLE_LOG = (
  _CLUSTER_LOCK_TIMEOUT = 5.0
 
 
- def _is_command_length_over_limit(command: str) -> bool:
- """Check if the length of the command exceeds the limit.
-
- We calculate the length of the command after quoting the command twice as
- when it is executed by the CommandRunner, the command will be quoted twice
- to ensure the correctness, which will add significant length to the command.
- """
-
- quoted_length = len(shlex.quote(shlex.quote(command)))
- return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
-
-
  def _is_message_too_long(returncode: int,
  output: Optional[str] = None,
  file_path: Optional[str] = None) -> bool:
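
The removed threshold and helper are not dropped from the package: later hunks in this diff call backend_utils.is_command_length_over_limit, and sky/backends/backend_utils.py grows by 45 lines, so the logic has presumably been relocated there. A minimal sketch of the relocated helper, reconstructed from the removed body above (its exact placement and final form in backend_utils.py are assumptions, not shown in this diff):

    import shlex

    # Presumed new home: sky/backends/backend_utils.py (an assumption).
    # Commands longer than this (after double shell-quoting) are written to a
    # file, rsynced, and executed remotely instead of being passed inline.
    _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # 100 KB, under the ~128 KB limit.


    def is_command_length_over_limit(command: str) -> bool:
        """Check whether the command, quoted twice, exceeds the inline limit."""
        # The CommandRunner quotes the command twice before execution, which
        # adds significant length, so measure the double-quoted form.
        quoted_length = len(shlex.quote(shlex.quote(command)))
        return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
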
@@ -294,6 +270,7 @@ def _get_cluster_config_template(cloud):
  clouds.Lambda: 'lambda-ray.yml.j2',
  clouds.IBM: 'ibm-ray.yml.j2',
  clouds.SCP: 'scp-ray.yml.j2',
+ clouds.Slurm: 'slurm-ray.yml.j2',
  clouds.OCI: 'oci-ray.yml.j2',
  clouds.Paperspace: 'paperspace-ray.yml.j2',
  clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
@@ -2516,7 +2493,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  @property
  def is_grpc_enabled_with_flag(self) -> bool:
  """Returns whether this handle has gRPC enabled and gRPC flag is set."""
- return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+ return (env_options.Options.ENABLE_GRPC.get() and
+ self.is_grpc_enabled and
+ not isinstance(self.launched_resources.cloud, clouds.Slurm))
 
  def __getstate__(self):
  state = self.__dict__.copy()
@@ -3596,6 +3575,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
  detach_setup: bool) -> None:
+
  start = time.time()
 
  if task.setup is None:
@@ -3647,7 +3627,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  _dump_final_script(setup_script,
  constants.PERSISTENT_SETUP_SCRIPT_PATH)
 
- if detach_setup or _is_command_length_over_limit(encoded_script):
+ if (detach_setup or
+ backend_utils.is_command_length_over_limit(encoded_script)):
  _dump_final_script(setup_script)
  create_script_code = 'true'
  else:
@@ -3804,7 +3785,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
- # Should also be ealier than _is_command_length_over_limit
+ # Should also be ealier than is_command_length_over_limit
  # Same reason as in _setup
  if self._dump_final_script:
  _dump_code_to_file(job_submit_cmd,
@@ -3837,7 +3818,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  tasks=managed_job_tasks,
  user_id=managed_job_user_id)
 
- if _is_command_length_over_limit(codegen):
+ if backend_utils.is_command_length_over_limit(codegen):
  _dump_code_to_file(codegen)
  queue_job_request = jobsv1_pb2.QueueJobRequest(
  job_id=job_id,
@@ -3859,7 +3840,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  use_legacy = True
 
  if use_legacy:
- if _is_command_length_over_limit(job_submit_cmd):
+ if backend_utils.is_command_length_over_limit(job_submit_cmd):
  _dump_code_to_file(codegen)
  job_submit_cmd = f'{mkdir_code} && {code}'
 
@@ -5850,6 +5831,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  return task.envs[constants.USER_ID_ENV_VAR]
  return None
 
+ def _get_task_codegen_class(
+ self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
+ """Returns the appropriate TaskCodeGen for the given handle."""
+ if isinstance(handle.launched_resources.cloud, clouds.Slurm):
+ assert (handle.cached_cluster_info
+ is not None), ('cached_cluster_info must be set')
+ head_instance = handle.cached_cluster_info.get_head_instance()
+ assert (head_instance is not None), (
+ 'Head instance not found in cached cluster info')
+ slurm_job_id = head_instance.tags.get('job_id')
+ assert (slurm_job_id
+ is not None), ('job_id tag not found in head instance')
+ return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
+ else:
+ return task_codegen.RayCodeGen()
+
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
  task: task_lib.Task, job_id: int,
  remote_log_dir: str) -> None:
@@ -5862,15 +5859,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
- codegen = task_codegen.RayCodeGen()
+ codegen = self._get_task_codegen_class(handle)
+
  codegen.add_prologue(job_id)
  codegen.add_setup(
  1,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
  env_vars=task_env_vars,
+ log_dir=log_dir,
  setup_cmd=self._setup_cmd,
- setup_log_path=os.path.join(log_dir, 'setup.log'),
  )
 
  codegen.add_task(
@@ -5907,15 +5905,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
- codegen = task_codegen.RayCodeGen()
+ codegen = self._get_task_codegen_class(handle)
+
  codegen.add_prologue(job_id)
  codegen.add_setup(
  num_actual_nodes,
  resources_dict,
  stable_cluster_internal_ips=internal_ips,
  env_vars=task_env_vars,
+ log_dir=log_dir,
  setup_cmd=self._setup_cmd,
- setup_log_path=os.path.join(log_dir, 'setup.log'),
  )
 
  codegen.add_task(
sky/backends/task_codegen.py CHANGED
@@ -4,6 +4,7 @@ import copy
  import inspect
  import json
  import math
+ import os
  import textwrap
  from typing import Dict, List, Optional, Tuple
 
@@ -181,8 +182,8 @@ class TaskCodeGen:
  resources_dict: Dict[str, float],
  stable_cluster_internal_ips: List[str],
  env_vars: Dict[str, str],
+ log_dir: str,
  setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
  ) -> None:
  """Generates code to set up the task on each node.
 
@@ -379,13 +380,15 @@ class RayCodeGen(TaskCodeGen):
  resources_dict: Dict[str, float],
  stable_cluster_internal_ips: List[str],
  env_vars: Dict[str, str],
+ log_dir: str,
  setup_cmd: Optional[str] = None,
- setup_log_path: Optional[str] = None,
  ) -> None:
  assert self._has_prologue, ('Call add_prologue() before '
  'add_setup().')
  self._has_setup = True
 
+ setup_log_path = os.path.join(log_dir, 'setup.log')
+
  bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
  # Set CPU to avoid ray hanging the resources allocation
  # for remote functions, since the task will request 1 CPU
@@ -631,3 +634,338 @@ class RayCodeGen(TaskCodeGen):
  """Generates code that waits for all tasks, then exits."""
  self._code.append('returncodes, _ = get_or_fail(futures, pg)')
  super().add_epilogue()
+
+
+ class SlurmCodeGen(TaskCodeGen):
+ """Code generator for task execution on Slurm using native srun."""
+
+ def __init__(self, slurm_job_id: str):
+ """Initialize SlurmCodeGen
+
+ Args:
+ slurm_job_id: The Slurm job ID, i.e. SLURM_JOB_ID
+ """
+ super().__init__()
+ self._slurm_job_id = slurm_job_id
+
+ def add_prologue(self, job_id: int) -> None:
+ assert not self._has_prologue, 'add_prologue() called twice?'
+ self._has_prologue = True
+ self.job_id = job_id
+
+ self._add_common_imports()
+
+ self._code.append(
+ textwrap.dedent("""\
+ import colorama
+ import copy
+ import json
+ import multiprocessing
+ import signal
+ import threading
+ from sky.backends import backend_utils
+ """))
+ self._add_skylet_imports()
+
+ self._add_constants()
+
+ self._add_logging_functions()
+
+ self._code.append(
+ textwrap.dedent(f"""\
+ def _cancel_slurm_job_steps():
+ slurm_job_id = {self._slurm_job_id!r}
+ assert slurm_job_id is not None, 'SLURM_JOB_ID is not set'
+ try:
+ # Query steps for this job: squeue -s -j JOBID -h -o "%i %j"
+ # Output format: "JOBID.STEPID STEPNAME"
+ # TODO(kevin): This assumes that compute node is able
+ # to run client commands against the controller.
+ # Validate this assumption.
+ result = subprocess.run(
+ ['squeue', '-s', '-j', slurm_job_id, '-h', '-o', '%i %j'],
+ capture_output=True, text=True, check=False)
+ for line in result.stdout.strip().split('\\n'):
+ if not line:
+ continue
+ parts = line.split()
+ assert len(parts) >= 2, 'Expected at least 2 parts'
+ step_id, step_name = parts[0], parts[1]
+ if step_name == f'sky-{self.job_id}':
+ subprocess.run(['scancel', step_id],
+ check=False, capture_output=True)
+ except Exception as e:
+ print(f'Error in _cancel_slurm_job_steps: {{e}}', flush=True)
+ pass
+
+ def _slurm_cleanup_handler(signum, _frame):
+ _cancel_slurm_job_steps()
+ # Re-raise to let default handler terminate.
+ signal.signal(signum, signal.SIG_DFL)
+ os.kill(os.getpid(), signum)
+
+ signal.signal(signal.SIGTERM, _slurm_cleanup_handler)
+ """))
+
+ self._code += [
+ 'autostop_lib.set_last_active_time_to_now()',
+ f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
+ ]
+
+ self._setup_cmd: Optional[str] = None
+ self._setup_envs: Optional[Dict[str, str]] = None
+ self._setup_log_dir: Optional[str] = None
+ self._setup_num_nodes: Optional[int] = None
+
+ def add_setup(
+ self,
+ num_nodes: int,
+ resources_dict: Dict[str, float],
+ stable_cluster_internal_ips: List[str],
+ env_vars: Dict[str, str],
+ log_dir: str,
+ setup_cmd: Optional[str] = None,
+ ) -> None:
+ assert self._has_prologue, ('Call add_prologue() before add_setup().')
+ self._has_setup = True
+ self._cluster_num_nodes = len(stable_cluster_internal_ips)
+ self._stable_cluster_ips = stable_cluster_internal_ips
+
+ self._add_waiting_for_resources_msg(num_nodes)
+
+ # Store setup information for use in add_task().
+ if setup_cmd is not None:
+ setup_envs = env_vars.copy()
+ setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
+ self._setup_cmd = setup_cmd
+ self._setup_envs = setup_envs
+ self._setup_log_dir = log_dir
+ self._setup_num_nodes = num_nodes
+
+ def add_task(
+ self,
+ num_nodes: int,
+ bash_script: Optional[str],
+ task_name: Optional[str],
+ resources_dict: Dict[str, float],
+ log_dir: str,
+ env_vars: Optional[Dict[str, str]] = None,
+ ) -> None:
+ """Generates code for invoking a bash command
+ using srun within sbatch allocation.
+ """
+ assert self._has_setup, 'Call add_setup() before add_task().'
+ env_vars = env_vars or {}
+ task_name = task_name if task_name is not None else 'task'
+
+ acc_name, acc_count = self._get_accelerator_details(resources_dict)
+ num_gpus = 0
+ if (acc_name is not None and
+ not accelerator_registry.is_schedulable_non_gpu_accelerator(
+ acc_name)):
+ num_gpus = int(math.ceil(acc_count))
+
+ # Slurm does not support fractional CPUs.
+ task_cpu_demand = int(math.ceil(resources_dict.pop('CPU')))
+
+ sky_env_vars_dict_str = [
+ textwrap.dedent(f"""\
+ sky_env_vars_dict = {{}}
+ sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
+ """)
+ ]
+
+ if env_vars:
+ sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
+ for k, v in env_vars.items())
+ sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
+
+ rclone_flush_script = self._get_rclone_flush_script()
+ streaming_msg = self._get_job_started_msg()
+ has_setup_cmd = self._setup_cmd is not None
+
+ self._code += [
+ sky_env_vars_dict_str,
+ textwrap.dedent(f"""\
+ script = {bash_script!r}
+ if script is None:
+ script = ''
+ rclone_flush_script = {rclone_flush_script!r}
+
+ if script or {has_setup_cmd!r}:
+ script += rclone_flush_script
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {num_gpus}
+
+ # Signal files for setup/run synchronization:
+ # 1. alloc_signal_file: srun has acquired allocation
+ # 2. setup_done_signal_file: Driver has finished setup, run can proceed
+ #
+ # Signal files are stored in home directory, which is
+ # assumed to be on a shared NFS mount accessible by all nodes.
+ # To support clusters with non-NFS home directories, we would
+ # need to let users specify an NFS-backed "working directory"
+ # or use a different coordination mechanism.
+ alloc_signal_file = f'~/.sky_alloc_{self._slurm_job_id}_{self.job_id}'
+ alloc_signal_file = os.path.expanduser(alloc_signal_file)
+ setup_done_signal_file = f'~/.sky_setup_done_{self._slurm_job_id}_{self.job_id}'
+ setup_done_signal_file = os.path.expanduser(setup_done_signal_file)
+
+ # Start exclusive srun in a thread to reserve allocation (similar to ray.get(pg.ready()))
+ gpu_arg = f'--gpus-per-node={num_gpus}' if {num_gpus} > 0 else ''
+
+ def build_task_runner_cmd(user_script, extra_flags, log_dir, env_vars_dict,
+ task_name=None, is_setup=False,
+ alloc_signal=None, setup_done_signal=None):
+ env_vars_json = json.dumps(env_vars_dict)
+
+ log_dir = shlex.quote(log_dir)
+ env_vars = shlex.quote(env_vars_json)
+ cluster_ips = shlex.quote(",".join({self._stable_cluster_ips!r}))
+
+ runner_args = f'--log-dir={{log_dir}} --env-vars={{env_vars}} --cluster-num-nodes={self._cluster_num_nodes} --cluster-ips={{cluster_ips}}'
+
+ if task_name is not None:
+ runner_args += f' --task-name={{shlex.quote(task_name)}}'
+
+ if is_setup:
+ runner_args += ' --is-setup'
+
+ if alloc_signal is not None:
+ runner_args += f' --alloc-signal-file={{shlex.quote(alloc_signal)}}'
+
+ if setup_done_signal is not None:
+ runner_args += f' --setup-done-signal-file={{shlex.quote(setup_done_signal)}}'
+
+ script_path = None
+ prefix = 'sky_setup_' if is_setup else 'sky_task_'
+ if backend_utils.is_command_length_over_limit(user_script):
+ with tempfile.NamedTemporaryFile('w', prefix=prefix, suffix='.sh', delete=False) as f:
+ f.write(user_script)
+ script_path = f.name
+ runner_args += f' --script-path={{shlex.quote(script_path)}}'
+ else:
+ runner_args += f' --script={{shlex.quote(user_script)}}'
+
+ # Use /usr/bin/env explicitly to work around a Slurm quirk where
+ # srun's execvp() doesn't check execute permissions, failing when
+ # $HOME/.local/bin/env (non-executable, from uv installation)
+ # shadows /usr/bin/env.
+ job_suffix = '-setup' if is_setup else ''
+ srun_cmd = (
+ f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
+ f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
+ f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
+ )
+ return srun_cmd, script_path
+
+ def run_thread_func():
+ # This blocks until Slurm allocates resources (--exclusive)
+ # --mem=0 to match RayCodeGen's behavior where we don't explicitly request memory.
+ run_flags = f'--nodes={num_nodes} --cpus-per-task={task_cpu_demand} --mem=0 {{gpu_arg}} --exclusive'
+ srun_cmd, task_script_path = build_task_runner_cmd(
+ script, run_flags, {log_dir!r}, sky_env_vars_dict,
+ task_name={task_name!r},
+ alloc_signal=alloc_signal_file,
+ setup_done_signal=setup_done_signal_file
+ )
+
+ proc = subprocess.Popen(srun_cmd, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in proc.stdout:
+ print(line, end='', flush=True)
+ proc.wait()
+
+ if task_script_path is not None:
+ os.remove(task_script_path)
+ return {{'return_code': proc.returncode, 'pid': proc.pid}}
+
+ run_thread_result = {{'result': None}}
+ def run_thread_wrapper():
+ run_thread_result['result'] = run_thread_func()
+
+ run_thread = threading.Thread(target=run_thread_wrapper)
+ run_thread.start()
+
+ # Wait for allocation signal from inside srun
+ while not os.path.exists(alloc_signal_file):
+ if not run_thread.is_alive():
+ # srun failed before creating the signal file.
+ run_thread.join()
+ result = run_thread_result['result']
+ returncode = int(result.get('return_code', 1))
+ pid = result.get('pid', os.getpid())
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{returncode}} (pid={{pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ returncodes = [returncode]
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ sys.exit(1)
+ time.sleep(0.1)
+
+ print({streaming_msg!r}, flush=True)
+
+ if {has_setup_cmd!r}:
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SETTING_UP)
+
+ # The schedule_step should be called after the job status is set to
+ # non-PENDING, otherwise, the scheduler will think the current job
+ # is not submitted yet, and skip the scheduling step.
+ job_lib.scheduler.schedule_step()
+
+ # --overlap as we have already secured allocation with the srun for the run section,
+ # and otherwise this srun would get blocked and deadlock.
+ setup_flags = f'--overlap --nodes={self._setup_num_nodes}'
+ setup_srun, setup_script_path = build_task_runner_cmd(
+ {self._setup_cmd!r}, setup_flags, {self._setup_log_dir!r}, {self._setup_envs!r},
+ is_setup=True
+ )
+
+ # Run setup srun directly, streaming output to driver stdout
+ setup_proc = subprocess.Popen(setup_srun, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True)
+ for line in setup_proc.stdout:
+ print(line, end='', flush=True)
+ setup_proc.wait()
+
+ if setup_script_path is not None:
+ os.remove(setup_script_path)
+
+ setup_returncode = setup_proc.returncode
+ if setup_returncode != 0:
+ setup_pid = setup_proc.pid
+ msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with return code {{setup_returncode}} (pid={{setup_pid}}).'
+ msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
+ print(msg, flush=True)
+ job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
+ # Cancel the srun spawned by run_thread_func.
+ _cancel_slurm_job_steps()
+ sys.exit(1)
+
+ job_lib.set_job_started({self.job_id!r})
+ if not {has_setup_cmd!r}:
+ # Need to call schedule_step() to make sure the scheduler
+ # schedule the next pending job.
+ job_lib.scheduler.schedule_step()
+
+ # Signal run thread to proceed.
+ pathlib.Path(setup_done_signal_file).touch()
+
+ # Wait for run thread to complete.
+ run_thread.join()
+ result = run_thread_result['result']
+
+ # Cleanup signal files
+ if os.path.exists(alloc_signal_file):
+ os.remove(alloc_signal_file)
+ if os.path.exists(setup_done_signal_file):
+ os.remove(setup_done_signal_file)
+
+ returncodes = [int(result.get('return_code', 1))]
+ else:
+ returncodes = [0]
+ """),
+ ]
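
The driver script generated above starts the run-section srun first, in a background thread with --exclusive, so that it is the step that reserves the allocation; two files then coordinate it with setup: the executor inside srun touches alloc_signal_file once it holds the allocation, the driver runs setup with --overlap, and touching setup_done_signal_file releases the run step. A minimal standalone sketch of that handshake, with illustrative paths and echo commands standing in for the real srun invocations:

    import os
    import pathlib
    import subprocess
    import tempfile
    import threading
    import time

    # Illustrative stand-ins; the real driver builds srun commands via
    # build_task_runner_cmd() and puts the signal files in the home directory.
    signal_dir = tempfile.mkdtemp()
    alloc_signal = os.path.join(signal_dir, 'alloc')
    setup_done_signal = os.path.join(signal_dir, 'setup_done')

    def run_step():
        # Stand-in for the exclusive srun: signal that the allocation is held,
        # then wait for the driver to finish setup before running the task.
        pathlib.Path(alloc_signal).touch()
        while not os.path.exists(setup_done_signal):
            time.sleep(0.1)
        return subprocess.run(['echo', 'run task here'], check=False).returncode

    result = {}
    run_thread = threading.Thread(target=lambda: result.update(rc=run_step()))
    run_thread.start()

    # Driver side: wait until the run step holds the allocation...
    while not os.path.exists(alloc_signal):
        time.sleep(0.1)
    # ...run setup while the allocation is held, then unblock the run step.
    subprocess.run(['echo', 'setup here'], check=False)
    pathlib.Path(setup_done_signal).touch()
    run_thread.join()
    print('run step exited with', result.get('rc'))
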
sky/catalog/__init__.py CHANGED
@@ -127,12 +127,9 @@ def list_accelerator_realtime(
  case_sensitive: bool = True,
  ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
  """Lists all accelerators offered by Sky with their realtime availability.
-
  Realtime availability is the total number of accelerators in the cluster
  and number of accelerators available at the time of the call.
-
  Used for fixed size cluster settings, such as Kubernetes.
-
  Returns:
  A tuple of three dictionaries mapping canonical accelerator names to:
  - A list of available counts. (e.g., [1, 2, 4])
sky/catalog/kubernetes_catalog.py CHANGED
@@ -204,6 +204,9 @@ def _list_accelerators(
  min_quantity_filter = quantity_filter if quantity_filter else 1
 
  for node in nodes:
+ # Check if node is ready
+ node_is_ready = node.is_ready()
+
  for key in keys:
  if key in node.metadata.labels:
  accelerator_name = lf.get_accelerator_from_label_value(
@@ -260,6 +263,15 @@ def _list_accelerators(
  total_accelerators_capacity[
  accelerator_name] += quantized_count
 
+ # Initialize the total_accelerators_available to make sure the
+ # key exists in the dictionary.
+ total_accelerators_available[accelerator_name] = (
+ total_accelerators_available.get(accelerator_name, 0))
+
+ # Skip availability counting for not-ready nodes
+ if not node_is_ready:
+ continue
+
  if error_on_get_allocated_gpu_qty_by_node:
  # If we can't get the allocated GPU quantity by each node,
  # we can't get the GPU usage.
@@ -268,10 +280,6 @@ def _list_accelerators(
 
  allocated_qty = allocated_qty_by_node[node.metadata.name]
  accelerators_available = accelerator_count - allocated_qty
- # Initialize the total_accelerators_available to make sure the
- # key exists in the dictionary.
- total_accelerators_available[accelerator_name] = (
- total_accelerators_available.get(accelerator_name, 0))
 
  if accelerators_available >= min_quantity_filter:
  quantized_availability = min_quantity_filter * (
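
Taken together, the kubernetes_catalog.py hunks above count capacity for every node, always initialize the availability entry, and let only ready nodes contribute to the available count, so an accelerator that exists only on not-ready nodes shows up with zero availability instead of being missing. A simplified sketch of that counting pattern, using hypothetical node dicts rather than the real Kubernetes node objects:

    # Hypothetical nodes; only the fields needed for the counting pattern.
    nodes = [
        {'name': 'gpu-node-1', 'ready': True, 'acc': 'H100', 'count': 8, 'allocated': 3},
        {'name': 'gpu-node-2', 'ready': False, 'acc': 'H100', 'count': 8, 'allocated': 0},
    ]

    total_capacity = {}
    total_available = {}
    for node in nodes:
        acc = node['acc']
        total_capacity[acc] = total_capacity.get(acc, 0) + node['count']
        # Ensure the key exists even if every node with this accelerator is
        # not ready, so callers see 0 availability instead of a missing entry.
        total_available[acc] = total_available.get(acc, 0)
        if not node['ready']:
            continue  # Capacity counted above; availability skipped.
        total_available[acc] += node['count'] - node['allocated']

    print(total_capacity)   # {'H100': 16}
    print(total_available)  # {'H100': 5}
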