skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
@@ -81,6 +81,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource

 if typing.TYPE_CHECKING:
     import grpc
@@ -192,18 +193,6 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
     pathlib.Path(directory_utils.get_sky_dir()) / 'backends' /
     'monkey_patches' / 'monkey_patch_ray_up.py')

-# The maximum size of a command line arguments is 128 KB, i.e. the command
-# executed with /bin/sh should be less than 128KB.
-# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
-#
-# If a user have very long run or setup commands, the generated command may
-# exceed the limit, as we directly include scripts in job submission commands.
-# If the command is too long, we instead write it to a file, rsync and execute
-# it.
-#
-# We use 100KB as a threshold to be safe for other arguments that
-# might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
 _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
     ('too long', 255),
     ('request-uri too large', 1),
@@ -218,18 +207,6 @@ _RESOURCES_UNAVAILABLE_LOG = (
 _CLUSTER_LOCK_TIMEOUT = 5.0


-def _is_command_length_over_limit(command: str) -> bool:
-    """Check if the length of the command exceeds the limit.
-
-    We calculate the length of the command after quoting the command twice as
-    when it is executed by the CommandRunner, the command will be quoted twice
-    to ensure the correctness, which will add significant length to the command.
-    """
-
-    quoted_length = len(shlex.quote(shlex.quote(command)))
-    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
-
-
 def _is_message_too_long(returncode: int,
                          output: Optional[str] = None,
                          file_path: Optional[str] = None) -> bool:
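Note: the helper removed here is not gone — later hunks in this file call it as backend_utils.is_command_length_over_limit. A minimal sketch of the check as it existed before the move, assuming the relocated helper keeps the same double-quoting logic and 100 KB threshold (the ~128 KB figure is the kernel per-argument limit cited in the removed comment):

import shlex

# 100 KB threshold, leaving headroom under the ~128 KB kernel limit on a
# single command-line argument (see linux/binfmts.h).
_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024


def is_command_length_over_limit(command: str) -> bool:
    # The CommandRunner quotes the command twice before executing it over
    # ssh, so measure the doubly-quoted length.
    quoted_length = len(shlex.quote(shlex.quote(command)))
    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH

When the check trips, the backend falls back to writing the script to a file, rsyncing it to the cluster, and executing it there, as described in the removed comment block.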
@@ -294,6 +271,7 @@ def _get_cluster_config_template(cloud):
         clouds.Lambda: 'lambda-ray.yml.j2',
         clouds.IBM: 'ibm-ray.yml.j2',
         clouds.SCP: 'scp-ray.yml.j2',
+        clouds.Slurm: 'slurm-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
         clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
@@ -938,8 +916,10 @@ class RetryingVmProvisioner(object):
         elif to_provision.region is not None and to_provision.cloud is not None:
             # For public clouds, provision.region is always set.
             if clouds.SSH().is_same_cloud(to_provision.cloud):
+                ssh_node_pool_name = common_utils.removeprefix(
+                    to_provision.region, 'ssh-')
                 message += (
-                    f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
+                    f'in SSH Node Pool ({ssh_node_pool_name}) '
                     f'for {requested_resources}. The SSH Node Pool may not '
                     'have enough resources.')
             elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
@@ -1199,7 +1179,9 @@ class RetryingVmProvisioner(object):
             if isinstance(to_provision.cloud, clouds.Kubernetes):
                 suffix = '.'
                 if region.name.startswith('ssh-'):
-                    suffix = f' ({region.name.lstrip("ssh-")})'
+                    ssh_node_pool_name = common_utils.removeprefix(
+                        region.name, 'ssh-')
+                    suffix = f' ({ssh_node_pool_name})'
             logger.info(
                 ux_utils.starting_message(
                     f'Launching{controller_str} on '
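The two removeprefix changes above fix a subtle string bug: str.lstrip('ssh-') strips any leading run of the characters 's', 'h' and '-', not the literal 'ssh-' prefix, so pool names that begin with those letters get mangled. A quick illustration (common_utils.removeprefix is assumed to behave like Python 3.9's str.removeprefix):

pool = 'ssh-shared-pool'

# lstrip() treats its argument as a set of characters to strip, so the
# leading 's' and 'h' of 'shared' are eaten as well.
print(pool.lstrip('ssh-'))        # -> 'ared-pool'   (wrong)

# removeprefix() drops only the exact leading substring.
print(pool.removeprefix('ssh-'))  # -> 'shared-pool' (intended)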
@@ -2516,7 +2498,9 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     @property
     def is_grpc_enabled_with_flag(self) -> bool:
         """Returns whether this handle has gRPC enabled and gRPC flag is set."""
-        return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
+        return (env_options.Options.ENABLE_GRPC.get() and
+                self.is_grpc_enabled and
+                not isinstance(self.launched_resources.cloud, clouds.Slurm))

     def __getstate__(self):
         state = self.__dict__.copy()
@@ -2753,6 +2737,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)

+    def get_job_exit_codes(
+        self,
+        request: 'jobsv1_pb2.GetJobExitCodesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobExitCodesResponse':
+        return self._jobs_stub.GetJobExitCodes(request, timeout=timeout)
+
     def tail_logs(
         self,
         request: 'jobsv1_pb2.TailLogsRequest',
@@ -3061,6 +3052,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 'sky api status -v | grep '
                 f'{cluster_name}'))

+    def _maybe_clear_external_cluster_failures(
+            self, cluster_name: str,
+            prev_cluster_status: Optional[status_lib.ClusterStatus]) -> None:
+        """Clear any existing cluster failures when reusing a cluster.
+
+        Clear any existing cluster failures when reusing a cluster. This ensures
+        that when a cluster failure is detected (causing the cluster to be
+        marked as INIT), the user can recover the cluster via `sky start` or
+        `sky launch` and clear the failure.
+        """
+        if prev_cluster_status is not None:
+            failures = ExternalFailureSource.clear(cluster_name=cluster_name)
+            if failures:
+                failure_details = [f'"{f["failure_mode"]}"' for f in failures]
+                plural = 's' if len(failures) > 1 else ''
+                logger.info(f'{colorama.Style.DIM}Cleared {len(failures)} '
+                            f'existing cluster failure{plural} for cluster '
+                            f'{cluster_name!r}: {", ".join(failure_details)}'
+                            f'{colorama.Style.RESET_ALL}')
+
     def _locked_provision(
         self,
         lock_id: str,
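ExternalFailureSource comes from the new sky/utils/plugin_extensions package listed above. Judging from this hunk, clear() returns the failure records it removed, each carrying at least a 'failure_mode' field; a rough sketch of consuming that return value (the cluster name and any fields beyond 'failure_mode' are illustrative assumptions, not confirmed by the diff):

# Hypothetical caller, mirroring _maybe_clear_external_cluster_failures above.
failures = ExternalFailureSource.clear(cluster_name='my-cluster')
if failures:
    cleared = ', '.join(f'"{f["failure_mode"]}"' for f in failures)
    print(f'Cleared {len(failures)} existing cluster failure(s): {cleared}')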
@@ -3091,6 +3102,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision_config.num_nodes, to_provision_config.resources)
         usage_lib.messages.usage.update_cluster_status(prev_cluster_status)

+        self._maybe_clear_external_cluster_failures(cluster_name,
+                                                    prev_cluster_status)
+
         # TODO(suquark): once we have sky on PyPI, we should directly
         # install sky from PyPI.
         # NOTE: can take ~2s.
@@ -3449,7 +3463,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ssh_user=handle.ssh_user,
             docker_user=handle.docker_user)
         cluster_utils.SSHConfigHelper.add_cluster(
-            handle.cluster_name, handle.cached_external_ips, auth_config,
+            handle.cluster_name, handle.cluster_name_on_cloud,
+            handle.cached_external_ips, auth_config,
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)

@@ -3596,6 +3611,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
+
         start = time.time()

         if task.setup is None:
@@ -3647,7 +3663,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             _dump_final_script(setup_script,
                                constants.PERSISTENT_SETUP_SCRIPT_PATH)

-        if detach_setup or _is_command_length_over_limit(encoded_script):
+        if (detach_setup or
+                backend_utils.is_command_length_over_limit(encoded_script)):
             _dump_final_script(setup_script)
             create_script_code = 'true'
         else:
3788
3805
  up=True,
3789
3806
  stream_logs=False)
3790
3807
 
3791
- cd = f'cd {SKY_REMOTE_WORKDIR}'
3792
- mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3793
- f'touch {remote_log_path}')
3808
+ mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
3794
3809
  encoded_script = shlex.quote(codegen)
3795
3810
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3796
3811
  job_submit_cmd = (
3797
3812
  # JOB_CMD_IDENTIFIER is used for identifying the process
3798
3813
  # retrieved with pid is the same driver process.
3799
3814
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3800
- f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3815
+ f'{constants.SKY_PYTHON_CMD} -u {script_path}'
3801
3816
  # Do not use &>, which is not POSIX and may not work.
3802
3817
  # Note that the order of ">filename 2>&1" matters.
3803
3818
  f'> {remote_log_path} 2>&1')
3804
3819
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3820
+
3821
+ # For Slurm, we need to wait for the job to complete before exiting,
3822
+ # because Slurm's proctrack/cgroup kills all processes when the srun
3823
+ # job step ends, including child processes launched as a separate
3824
+ # process group.
3825
+ # So this keeps srun alive so the job driver process that was spawned
3826
+ # (and runs in the background) by job_lib.JobScheduler.schedule_step()
3827
+ # does not get killed.
3828
+ # Note: proctrack/cgroup is enabled by default on Nebius' Managed
3829
+ # Soperator.
3830
+ is_slurm = isinstance(handle.launched_resources.cloud, clouds.Slurm)
3831
+ if is_slurm:
3832
+ wait_code = job_lib.JobLibCodeGen.wait_for_job(job_id)
3833
+ code = code + ' && ' + wait_code
3834
+
3805
3835
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3806
3836
 
3807
- # Should also be ealier than _is_command_length_over_limit
3837
+ # Should also be ealier than is_command_length_over_limit
3808
3838
  # Same reason as in _setup
3809
3839
  if self._dump_final_script:
3810
3840
  _dump_code_to_file(job_submit_cmd,
@@ -3837,7 +3867,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3837
3867
  tasks=managed_job_tasks,
3838
3868
  user_id=managed_job_user_id)
3839
3869
 
3840
- if _is_command_length_over_limit(codegen):
3870
+ if backend_utils.is_command_length_over_limit(codegen):
3841
3871
  _dump_code_to_file(codegen)
3842
3872
  queue_job_request = jobsv1_pb2.QueueJobRequest(
3843
3873
  job_id=job_id,
@@ -3859,7 +3889,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3859
3889
  use_legacy = True
3860
3890
 
3861
3891
  if use_legacy:
3862
- if _is_command_length_over_limit(job_submit_cmd):
3892
+ if backend_utils.is_command_length_over_limit(job_submit_cmd):
3863
3893
  _dump_code_to_file(codegen)
3864
3894
  job_submit_cmd = f'{mkdir_code} && {code}'
3865
3895
 
@@ -3886,10 +3916,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3886
3916
 
3887
3917
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3888
3918
 
3889
- returncode, stdout, stderr = self.run_on_head(handle,
3890
- job_submit_cmd,
3891
- stream_logs=False,
3892
- require_outputs=True)
3919
+ # For Slurm, run in background so that SSH returns immediately.
3920
+ # This is needed because we add the wait_for_job code above which
3921
+ # makes the command block until the job completes.
3922
+ returncode, stdout, stderr = self.run_on_head(
3923
+ handle,
3924
+ job_submit_cmd,
3925
+ stream_logs=False,
3926
+ require_outputs=True,
3927
+ run_in_background=is_slurm)
3893
3928
  # Happens when someone calls `sky exec` but remote is outdated for
3894
3929
  # running a job. Necessitating calling `sky launch`.
3895
3930
  backend_utils.check_stale_runtime_on_remote(returncode, stderr,
@@ -3906,11 +3941,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3906
3941
  _dump_code_to_file(codegen)
3907
3942
  job_submit_cmd = f'{mkdir_code} && {code}'
3908
3943
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3944
+ # See comment above for why run_in_background=is_slurm.
3909
3945
  returncode, stdout, stderr = self.run_on_head(
3910
3946
  handle,
3911
3947
  job_submit_cmd,
3912
3948
  stream_logs=False,
3913
- require_outputs=True)
3949
+ require_outputs=True,
3950
+ run_in_background=is_slurm)
3914
3951
 
3915
3952
  subprocess_utils.handle_returncode(
3916
3953
  returncode,
@@ -4969,6 +5006,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4969
5006
  ports_cleaned_up = True
4970
5007
  except exceptions.PortDoesNotExistError:
4971
5008
  logger.debug('Ports do not exist. Skipping cleanup.')
5009
+ ports_cleaned_up = True
4972
5010
  except Exception as e: # pylint: disable=broad-except
4973
5011
  if purge:
4974
5012
  msg = common_utils.format_exception(e, use_bracket=True)
@@ -5041,11 +5079,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5041
5079
  config['provider'],
5042
5080
  non_terminated_only=False)
5043
5081
 
5044
- unexpected_node_state: Optional[Tuple[str, str]] = None
5082
+ unexpected_nodes = []
5045
5083
  for node_id, node_status_tuple in node_status_dict.items():
5046
5084
  node_status, reason = node_status_tuple
5047
- reason = '' if reason is None else f' ({reason})'
5048
- logger.debug(f'{node_id} status: {node_status}{reason}')
5085
+ reason_str = '' if reason is None else f' ({reason})'
5086
+ logger.debug(f'{node_id} status: {node_status}{reason_str}')
5049
5087
  # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
5050
5088
  # between "stopping/stopped" and "terminating/terminated",
5051
5089
  # so we allow for either status instead of casing on
@@ -5053,19 +5091,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5053
5091
  if node_status not in [
5054
5092
  None, status_lib.ClusterStatus.STOPPED
5055
5093
  ]:
5056
- unexpected_node_state = (node_id, node_status)
5057
- break
5094
+ unexpected_nodes.append((node_id, node_status, reason))
5058
5095
 
5059
- if unexpected_node_state is None:
5096
+ if not unexpected_nodes:
5060
5097
  break
5061
5098
 
5062
5099
  attempts += 1
5063
5100
  if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
5064
5101
  time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
5065
5102
  else:
5066
- (node_id, node_status) = unexpected_node_state
5067
- raise RuntimeError(f'Instance {node_id} in unexpected '
5068
- f'state {node_status}.')
5103
+ unexpected_nodes_str = '\n'.join([
5104
+ f' - {node_id}: {node_status}' +
5105
+ (f' ({reason})' if reason else '')
5106
+ for node_id, node_status, reason in unexpected_nodes
5107
+ ])
5108
+ raise RuntimeError(f'Instances in unexpected state:\n'
5109
+ f'{unexpected_nodes_str}')
5069
5110
 
5070
5111
  # If cluster_yaml is None, the cluster should ensured to be terminated,
5071
5112
  # so we don't need to do the double check.
@@ -5352,6 +5393,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5352
5393
  assert handle is not None
5353
5394
  # Cluster already exists.
5354
5395
  self.check_resources_fit_cluster(handle, task)
5396
+
5355
5397
  # Use the existing cluster.
5356
5398
  assert handle.launched_resources is not None, (cluster_name, handle)
5357
5399
  # Take a random resource in order to get resource info that applies
@@ -5403,27 +5445,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5403
5445
  for resource in task.resources:
5404
5446
  assert (resource.cluster_config_overrides ==
5405
5447
  one_task_resource.cluster_config_overrides)
5406
- if isinstance(to_provision.cloud, clouds.Kubernetes):
5448
+
5449
+ cluster_yaml_str = global_user_state.get_cluster_yaml_str(
5450
+ cluster_name)
5451
+ cluster_yaml_obj = (yaml_utils.safe_load(cluster_yaml_str)
5452
+ if cluster_yaml_str is not None else None)
5453
+
5454
+ def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
5455
+ return (yaml_obj.get('available_node_types',
5456
+ {}).get('ray_head_default',
5457
+ {}).get('node_config', {}))
5458
+
5459
+ if isinstance(to_provision.cloud,
5460
+ clouds.Kubernetes) and cluster_yaml_obj is not None:
5407
5461
  # Warn users if the Kubernetes pod config is different
5408
5462
  # from the existing cluster.
5409
- cluster_yaml_str = global_user_state.get_cluster_yaml_str(
5410
- cluster_name)
5411
- actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
5412
5463
  desired_cluster_yaml_obj = (
5413
5464
  kubernetes_utils.combine_pod_config_fields_and_metadata(
5414
- actual_cluster_yaml_obj,
5465
+ cluster_yaml_obj,
5415
5466
  cluster_config_overrides=one_task_resource.
5416
5467
  cluster_config_overrides,
5417
5468
  cloud=to_provision.cloud,
5418
5469
  context=to_provision.region))
5419
5470
 
5420
- def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
5421
- return (yaml_obj.get('available_node_types',
5422
- {}).get('ray_head_default',
5423
- {}).get('node_config', {}))
5424
-
5425
5471
  if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
5426
- actual_cluster_yaml_obj):
5472
+ cluster_yaml_obj):
5427
5473
  # pylint: disable=line-too-long
5428
5474
  logger.warning(
5429
5475
  f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
@@ -5434,6 +5480,101 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5434
5480
  f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
5435
5481
  f'{colorama.Style.RESET_ALL}')
5436
5482
 
5483
+ # Check for volume mount warnings
5484
+ if task.volume_mounts:
5485
+ # Get existing cluster's volume mounts from cluster yaml
5486
+ existing_volume_names = set()
5487
+ try:
5488
+ if cluster_yaml_obj is not None:
5489
+ # Extract volume names from existing cluster
5490
+ node_config = _get_pod_config(cluster_yaml_obj)
5491
+
5492
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
5493
+ # Check for K8s-style persistent volumes
5494
+ # (spec.volumes)
5495
+ # See sky/templates/kubernetes-ray.yml.j2.
5496
+ volumes = node_config.get('spec',
5497
+ {}).get('volumes', [])
5498
+ for vol in volumes:
5499
+ # Volume from PVC has structure:
5500
+ # - name: <volume_name>
5501
+ # persistentVolumeClaim:
5502
+ # claimName: <volume_name_on_cloud>
5503
+ if 'persistentVolumeClaim' in vol:
5504
+ pvc = vol.get('persistentVolumeClaim', {})
5505
+ # Use claimName (volume_name_on_cloud) to
5506
+ # be consistent with RunPod.
5507
+ vol_name_on_cloud = pvc.get('claimName')
5508
+ if vol_name_on_cloud:
5509
+ existing_volume_names.add(
5510
+ vol_name_on_cloud)
5511
+
5512
+ # Check for K8s ephemeral volumes
5513
+ # See sky/templates/kubernetes-ray.yml.j2.
5514
+ provider_config = cluster_yaml_obj.get(
5515
+ 'provider', {})
5516
+ ephemeral_specs = provider_config.get(
5517
+ 'ephemeral_volume_specs', [])
5518
+ for spec in ephemeral_specs:
5519
+ # For ephemeral volumes, we check the mount
5520
+ # path.
5521
+ mount_path = spec.get('path')
5522
+ if mount_path:
5523
+ existing_volume_names.add(mount_path)
5524
+
5525
+ elif isinstance(to_provision.cloud, clouds.RunPod):
5526
+ # Check for custom VolumeMounts config
5527
+ # (e.g. RunPod)
5528
+ # See sky/templates/runpod-ray.yml.j2.
5529
+ volume_mounts_config = node_config.get(
5530
+ 'VolumeMounts', [])
5531
+ for vol_mount in volume_mounts_config:
5532
+ vol_name = vol_mount.get('VolumeNameOnCloud')
5533
+ if vol_name:
5534
+ existing_volume_names.add(vol_name)
5535
+ except Exception as e: # pylint: disable=broad-except
5536
+ # If we can't get the existing volume mounts, log debug
5537
+ # and skip the warning check
5538
+ logger.debug(f'Failed to check existing volume mounts: {e}',
5539
+ exc_info=True)
5540
+
5541
+ # Check if task has new volumes not in existing cluster
5542
+ new_ephemeral_volumes = []
5543
+ new_persistent_volumes = []
5544
+ for volume_mount in task.volume_mounts:
5545
+ # Compare using volume_name for user-facing name
5546
+ if volume_mount.is_ephemeral:
5547
+ if volume_mount.path not in existing_volume_names:
5548
+ new_ephemeral_volumes.append(volume_mount.path)
5549
+ elif (volume_mount.volume_name not in existing_volume_names
5550
+ and volume_mount.volume_config.name_on_cloud
5551
+ not in existing_volume_names):
5552
+ new_persistent_volumes.append(volume_mount.volume_name)
5553
+
5554
+ if new_ephemeral_volumes or new_persistent_volumes:
5555
+ msg_parts = []
5556
+ if new_ephemeral_volumes:
5557
+ msg_parts.append(f'new ephemeral volume(s) with path '
5558
+ f'{", ".join(new_ephemeral_volumes)}')
5559
+ if new_persistent_volumes:
5560
+ msg_parts.append(
5561
+ f'new volume(s) {", ".join(new_persistent_volumes)}'
5562
+ )
5563
+
5564
+ volume_msg = ' and '.join(msg_parts)
5565
+ # Capitalize the first letter of the message
5566
+ volume_msg = volume_msg[0].upper() + volume_msg[1:]
5567
+
5568
+ logger.warning(
5569
+ f'{colorama.Fore.YELLOW}WARNING: {volume_msg} '
5570
+ f'specified in task but not '
5571
+ f'mounted to existing cluster "{cluster_name}". '
5572
+ f'These volumes will not be mounted to the cluster. '
5573
+ f'To mount new volumes, either:\n'
5574
+ f' • Use a new cluster, or\n'
5575
+ f' • Terminate and recreate this cluster'
5576
+ f'{colorama.Style.RESET_ALL}')
5577
+
5437
5578
  return RetryingVmProvisioner.ToProvisionConfig(
5438
5579
  cluster_name,
5439
5580
  to_provision,
@@ -5850,6 +5991,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5850
5991
  return task.envs[constants.USER_ID_ENV_VAR]
5851
5992
  return None
5852
5993
 
5994
+ def _get_task_codegen_class(
5995
+ self, handle: CloudVmRayResourceHandle) -> task_codegen.TaskCodeGen:
5996
+ """Returns the appropriate TaskCodeGen for the given handle."""
5997
+ if isinstance(handle.launched_resources.cloud, clouds.Slurm):
5998
+ assert (handle.cached_cluster_info
5999
+ is not None), ('cached_cluster_info must be set')
6000
+ head_instance = handle.cached_cluster_info.get_head_instance()
6001
+ assert (head_instance is not None), (
6002
+ 'Head instance not found in cached cluster info')
6003
+ slurm_job_id = head_instance.tags.get('job_id')
6004
+ assert (slurm_job_id
6005
+ is not None), ('job_id tag not found in head instance')
6006
+ return task_codegen.SlurmCodeGen(slurm_job_id=slurm_job_id)
6007
+ else:
6008
+ return task_codegen.RayCodeGen()
6009
+
5853
6010
  def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
5854
6011
  task: task_lib.Task, job_id: int,
5855
6012
  remote_log_dir: str) -> None:
@@ -5862,15 +6019,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5862
6019
 
5863
6020
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
5864
6021
 
5865
- codegen = task_codegen.RayCodeGen()
6022
+ codegen = self._get_task_codegen_class(handle)
6023
+
5866
6024
  codegen.add_prologue(job_id)
5867
6025
  codegen.add_setup(
5868
6026
  1,
5869
6027
  resources_dict,
5870
6028
  stable_cluster_internal_ips=internal_ips,
5871
6029
  env_vars=task_env_vars,
6030
+ log_dir=log_dir,
5872
6031
  setup_cmd=self._setup_cmd,
5873
- setup_log_path=os.path.join(log_dir, 'setup.log'),
5874
6032
  )
5875
6033
 
5876
6034
  codegen.add_task(
@@ -5907,15 +6065,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
5907
6065
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
5908
6066
  task_env_vars = self._get_task_env_vars(task, job_id, handle)
5909
6067
 
5910
- codegen = task_codegen.RayCodeGen()
6068
+ codegen = self._get_task_codegen_class(handle)
6069
+
5911
6070
  codegen.add_prologue(job_id)
5912
6071
  codegen.add_setup(
5913
6072
  num_actual_nodes,
5914
6073
  resources_dict,
5915
6074
  stable_cluster_internal_ips=internal_ips,
5916
6075
  env_vars=task_env_vars,
6076
+ log_dir=log_dir,
5917
6077
  setup_cmd=self._setup_cmd,
5918
- setup_log_path=os.path.join(log_dir, 'setup.log'),
5919
6078
  )
5920
6079
 
5921
6080
  codegen.add_task(