skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/backends/cloud_vm_ray_backend.py
CHANGED

@@ -81,6 +81,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_lib
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource
 
 if typing.TYPE_CHECKING:
     import grpc
@@ -915,8 +916,10 @@ class RetryingVmProvisioner(object):
        elif to_provision.region is not None and to_provision.cloud is not None:
            # For public clouds, provision.region is always set.
            if clouds.SSH().is_same_cloud(to_provision.cloud):
+               ssh_node_pool_name = common_utils.removeprefix(
+                   to_provision.region, 'ssh-')
                message += (
-                   f'in SSH Node Pool ({
+                   f'in SSH Node Pool ({ssh_node_pool_name}) '
                    f'for {requested_resources}. The SSH Node Pool may not '
                    'have enough resources.')
            elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
@@ -1176,7 +1179,9 @@ class RetryingVmProvisioner(object):
            if isinstance(to_provision.cloud, clouds.Kubernetes):
                suffix = '.'
                if region.name.startswith('ssh-'):
-
+                   ssh_node_pool_name = common_utils.removeprefix(
+                       region.name, 'ssh-')
+                   suffix = f' ({ssh_node_pool_name})'
                logger.info(
                    ux_utils.starting_message(
                        f'Launching{controller_str} on '
@@ -2732,6 +2737,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
 
+    def get_job_exit_codes(
+        self,
+        request: 'jobsv1_pb2.GetJobExitCodesRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobExitCodesResponse':
+        return self._jobs_stub.GetJobExitCodes(request, timeout=timeout)
+
     def tail_logs(
         self,
         request: 'jobsv1_pb2.TailLogsRequest',
@@ -3040,6 +3052,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 'sky api status -v | grep '
                 f'{cluster_name}'))
 
+    def _maybe_clear_external_cluster_failures(
+            self, cluster_name: str,
+            prev_cluster_status: Optional[status_lib.ClusterStatus]) -> None:
+        """Clear any existing cluster failures when reusing a cluster.
+
+        Clear any existing cluster failures when reusing a cluster. This ensures
+        that when a cluster failure is detected (causing the cluster to be
+        marked as INIT), the user can recover the cluster via `sky start` or
+        `sky launch` and clear the failure.
+        """
+        if prev_cluster_status is not None:
+            failures = ExternalFailureSource.clear(cluster_name=cluster_name)
+            if failures:
+                failure_details = [f'"{f["failure_mode"]}"' for f in failures]
+                plural = 's' if len(failures) > 1 else ''
+                logger.info(f'{colorama.Style.DIM}Cleared {len(failures)} '
+                            f'existing cluster failure{plural} for cluster '
+                            f'{cluster_name!r}: {", ".join(failure_details)}'
+                            f'{colorama.Style.RESET_ALL}')
+
     def _locked_provision(
         self,
         lock_id: str,
@@ -3070,6 +3102,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision_config.num_nodes, to_provision_config.resources)
         usage_lib.messages.usage.update_cluster_status(prev_cluster_status)
 
+        self._maybe_clear_external_cluster_failures(cluster_name,
+                                                    prev_cluster_status)
+
         # TODO(suquark): once we have sky on PyPI, we should directly
         # install sky from PyPI.
         # NOTE: can take ~2s.
@@ -3428,7 +3463,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             ssh_user=handle.ssh_user,
             docker_user=handle.docker_user)
         cluster_utils.SSHConfigHelper.add_cluster(
-            handle.cluster_name, handle.
+            handle.cluster_name, handle.cluster_name_on_cloud,
+            handle.cached_external_ips, auth_config,
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
@@ -3769,20 +3805,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             up=True,
             stream_logs=False)
 
-
-        mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
-                      f'touch {remote_log_path}')
+        mkdir_code = f'mkdir -p {remote_log_dir} && touch {remote_log_path}'
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
             # JOB_CMD_IDENTIFIER is used for identifying the process
             # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
-            f'{
+            f'{constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
+
+        # For Slurm, we need to wait for the job to complete before exiting,
+        # because Slurm's proctrack/cgroup kills all processes when the srun
+        # job step ends, including child processes launched as a separate
+        # process group.
+        # So this keeps srun alive so the job driver process that was spawned
+        # (and runs in the background) by job_lib.JobScheduler.schedule_step()
+        # does not get killed.
+        # Note: proctrack/cgroup is enabled by default on Nebius' Managed
+        # Soperator.
+        is_slurm = isinstance(handle.launched_resources.cloud, clouds.Slurm)
+        if is_slurm:
+            wait_code = job_lib.JobLibCodeGen.wait_for_job(job_id)
+            code = code + ' && ' + wait_code
+
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
         # Should also be ealier than is_command_length_over_limit
|
@@ -3867,10 +3916,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3867
3916
|
|
|
3868
3917
|
job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
|
|
3869
3918
|
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3919
|
+
# For Slurm, run in background so that SSH returns immediately.
|
|
3920
|
+
# This is needed because we add the wait_for_job code above which
|
|
3921
|
+
# makes the command block until the job completes.
|
|
3922
|
+
returncode, stdout, stderr = self.run_on_head(
|
|
3923
|
+
handle,
|
|
3924
|
+
job_submit_cmd,
|
|
3925
|
+
stream_logs=False,
|
|
3926
|
+
require_outputs=True,
|
|
3927
|
+
run_in_background=is_slurm)
|
|
3874
3928
|
# Happens when someone calls `sky exec` but remote is outdated for
|
|
3875
3929
|
# running a job. Necessitating calling `sky launch`.
|
|
3876
3930
|
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
|
|
@@ -3887,11 +3941,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3887
3941
|
_dump_code_to_file(codegen)
|
|
3888
3942
|
job_submit_cmd = f'{mkdir_code} && {code}'
|
|
3889
3943
|
job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
|
|
3944
|
+
# See comment above for why run_in_background=is_slurm.
|
|
3890
3945
|
returncode, stdout, stderr = self.run_on_head(
|
|
3891
3946
|
handle,
|
|
3892
3947
|
job_submit_cmd,
|
|
3893
3948
|
stream_logs=False,
|
|
3894
|
-
require_outputs=True
|
|
3949
|
+
require_outputs=True,
|
|
3950
|
+
run_in_background=is_slurm)
|
|
3895
3951
|
|
|
3896
3952
|
subprocess_utils.handle_returncode(
|
|
3897
3953
|
returncode,
|
|
@@ -4950,6 +5006,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
4950
5006
|
ports_cleaned_up = True
|
|
4951
5007
|
except exceptions.PortDoesNotExistError:
|
|
4952
5008
|
logger.debug('Ports do not exist. Skipping cleanup.')
|
|
5009
|
+
ports_cleaned_up = True
|
|
4953
5010
|
except Exception as e: # pylint: disable=broad-except
|
|
4954
5011
|
if purge:
|
|
4955
5012
|
msg = common_utils.format_exception(e, use_bracket=True)
|
|
@@ -5022,11 +5079,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
5022
5079
|
config['provider'],
|
|
5023
5080
|
non_terminated_only=False)
|
|
5024
5081
|
|
|
5025
|
-
|
|
5082
|
+
unexpected_nodes = []
|
|
5026
5083
|
for node_id, node_status_tuple in node_status_dict.items():
|
|
5027
5084
|
node_status, reason = node_status_tuple
|
|
5028
|
-
|
|
5029
|
-
logger.debug(f'{node_id} status: {node_status}{
|
|
5085
|
+
reason_str = '' if reason is None else f' ({reason})'
|
|
5086
|
+
logger.debug(f'{node_id} status: {node_status}{reason_str}')
|
|
5030
5087
|
# FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
|
|
5031
5088
|
# between "stopping/stopped" and "terminating/terminated",
|
|
5032
5089
|
# so we allow for either status instead of casing on
|
|
@@ -5034,19 +5091,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
5034
5091
|
if node_status not in [
|
|
5035
5092
|
None, status_lib.ClusterStatus.STOPPED
|
|
5036
5093
|
]:
|
|
5037
|
-
|
|
5038
|
-
break
|
|
5094
|
+
unexpected_nodes.append((node_id, node_status, reason))
|
|
5039
5095
|
|
|
5040
|
-
if
|
|
5096
|
+
if not unexpected_nodes:
|
|
5041
5097
|
break
|
|
5042
5098
|
|
|
5043
5099
|
attempts += 1
|
|
5044
5100
|
if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
|
|
5045
5101
|
time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
|
|
5046
5102
|
else:
|
|
5047
|
-
|
|
5048
|
-
|
|
5049
|
-
|
|
5103
|
+
unexpected_nodes_str = '\n'.join([
|
|
5104
|
+
f' - {node_id}: {node_status}' +
|
|
5105
|
+
(f' ({reason})' if reason else '')
|
|
5106
|
+
for node_id, node_status, reason in unexpected_nodes
|
|
5107
|
+
])
|
|
5108
|
+
raise RuntimeError(f'Instances in unexpected state:\n'
|
|
5109
|
+
f'{unexpected_nodes_str}')
|
|
5050
5110
|
|
|
5051
5111
|
# If cluster_yaml is None, the cluster should ensured to be terminated,
|
|
5052
5112
|
# so we don't need to do the double check.
|
|
@@ -5333,6 +5393,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert handle is not None
         # Cluster already exists.
         self.check_resources_fit_cluster(handle, task)
+
         # Use the existing cluster.
         assert handle.launched_resources is not None, (cluster_name, handle)
         # Take a random resource in order to get resource info that applies
@@ -5384,27 +5445,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         for resource in task.resources:
             assert (resource.cluster_config_overrides ==
                     one_task_resource.cluster_config_overrides)
-
+
+        cluster_yaml_str = global_user_state.get_cluster_yaml_str(
+            cluster_name)
+        cluster_yaml_obj = (yaml_utils.safe_load(cluster_yaml_str)
+                            if cluster_yaml_str is not None else None)
+
+        def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
+            return (yaml_obj.get('available_node_types',
+                                 {}).get('ray_head_default',
+                                         {}).get('node_config', {}))
+
+        if isinstance(to_provision.cloud,
+                      clouds.Kubernetes) and cluster_yaml_obj is not None:
             # Warn users if the Kubernetes pod config is different
             # from the existing cluster.
-            cluster_yaml_str = global_user_state.get_cluster_yaml_str(
-                cluster_name)
-            actual_cluster_yaml_obj = yaml_utils.safe_load(cluster_yaml_str)
             desired_cluster_yaml_obj = (
                 kubernetes_utils.combine_pod_config_fields_and_metadata(
-
+                    cluster_yaml_obj,
                     cluster_config_overrides=one_task_resource.
                     cluster_config_overrides,
                     cloud=to_provision.cloud,
                     context=to_provision.region))
 
-            def _get_pod_config(yaml_obj: Dict[str, Any]) -> Dict[str, Any]:
-                return (yaml_obj.get('available_node_types',
-                                     {}).get('ray_head_default',
-                                             {}).get('node_config', {}))
-
             if _get_pod_config(desired_cluster_yaml_obj) != _get_pod_config(
-
+                    cluster_yaml_obj):
                 # pylint: disable=line-too-long
                 logger.warning(
                     f'{colorama.Fore.YELLOW}WARNING: Kubernetes pod config mismatch detected. Task requires different '
@@ -5415,6 +5480,101 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f' • Or restart this cluster: sky down {cluster_name}; sky launch -c {cluster_name} ...'
                     f'{colorama.Style.RESET_ALL}')
 
+        # Check for volume mount warnings
+        if task.volume_mounts:
+            # Get existing cluster's volume mounts from cluster yaml
+            existing_volume_names = set()
+            try:
+                if cluster_yaml_obj is not None:
+                    # Extract volume names from existing cluster
+                    node_config = _get_pod_config(cluster_yaml_obj)
+
+                    if isinstance(to_provision.cloud, clouds.Kubernetes):
+                        # Check for K8s-style persistent volumes
+                        # (spec.volumes)
+                        # See sky/templates/kubernetes-ray.yml.j2.
+                        volumes = node_config.get('spec',
+                                                  {}).get('volumes', [])
+                        for vol in volumes:
+                            # Volume from PVC has structure:
+                            # - name: <volume_name>
+                            #   persistentVolumeClaim:
+                            #     claimName: <volume_name_on_cloud>
+                            if 'persistentVolumeClaim' in vol:
+                                pvc = vol.get('persistentVolumeClaim', {})
+                                # Use claimName (volume_name_on_cloud) to
+                                # be consistent with RunPod.
+                                vol_name_on_cloud = pvc.get('claimName')
+                                if vol_name_on_cloud:
+                                    existing_volume_names.add(
+                                        vol_name_on_cloud)
+
+                        # Check for K8s ephemeral volumes
+                        # See sky/templates/kubernetes-ray.yml.j2.
+                        provider_config = cluster_yaml_obj.get(
+                            'provider', {})
+                        ephemeral_specs = provider_config.get(
+                            'ephemeral_volume_specs', [])
+                        for spec in ephemeral_specs:
+                            # For ephemeral volumes, we check the mount
+                            # path.
+                            mount_path = spec.get('path')
+                            if mount_path:
+                                existing_volume_names.add(mount_path)
+
+                    elif isinstance(to_provision.cloud, clouds.RunPod):
+                        # Check for custom VolumeMounts config
+                        # (e.g. RunPod)
+                        # See sky/templates/runpod-ray.yml.j2.
+                        volume_mounts_config = node_config.get(
+                            'VolumeMounts', [])
+                        for vol_mount in volume_mounts_config:
+                            vol_name = vol_mount.get('VolumeNameOnCloud')
+                            if vol_name:
+                                existing_volume_names.add(vol_name)
+            except Exception as e:  # pylint: disable=broad-except
+                # If we can't get the existing volume mounts, log debug
+                # and skip the warning check
+                logger.debug(f'Failed to check existing volume mounts: {e}',
+                             exc_info=True)
+
+            # Check if task has new volumes not in existing cluster
+            new_ephemeral_volumes = []
+            new_persistent_volumes = []
+            for volume_mount in task.volume_mounts:
+                # Compare using volume_name for user-facing name
+                if volume_mount.is_ephemeral:
+                    if volume_mount.path not in existing_volume_names:
+                        new_ephemeral_volumes.append(volume_mount.path)
+                elif (volume_mount.volume_name not in existing_volume_names
+                      and volume_mount.volume_config.name_on_cloud
+                      not in existing_volume_names):
+                    new_persistent_volumes.append(volume_mount.volume_name)
+
+            if new_ephemeral_volumes or new_persistent_volumes:
+                msg_parts = []
+                if new_ephemeral_volumes:
+                    msg_parts.append(f'new ephemeral volume(s) with path '
+                                     f'{", ".join(new_ephemeral_volumes)}')
+                if new_persistent_volumes:
+                    msg_parts.append(
+                        f'new volume(s) {", ".join(new_persistent_volumes)}'
+                    )
+
+                volume_msg = ' and '.join(msg_parts)
+                # Capitalize the first letter of the message
+                volume_msg = volume_msg[0].upper() + volume_msg[1:]
+
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}WARNING: {volume_msg} '
+                    f'specified in task but not '
+                    f'mounted to existing cluster "{cluster_name}". '
+                    f'These volumes will not be mounted to the cluster. '
+                    f'To mount new volumes, either:\n'
+                    f' • Use a new cluster, or\n'
+                    f' • Terminate and recreate this cluster'
+                    f'{colorama.Style.RESET_ALL}')
+
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
             to_provision,
sky/backends/task_codegen.py
CHANGED

@@ -147,6 +147,7 @@ class TaskCodeGen:
            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
               [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
               [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
+             FLUSH_START_TIME=$(date +%s)
              flushed=0
              # extra second on top of --vfs-cache-poll-interval to
              # avoid race condition between rclone log line creation and this check.
@@ -159,13 +160,32 @@ class TaskCodeGen:
                  exitcode=0
                  tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
                  if [ $exitcode -ne 0 ]; then
-
+                   ELAPSED=$(($(date +%s) - FLUSH_START_TIME))
+                   # Extract the last vfs cache status line to show what we're waiting for
+                   CACHE_STATUS=$(tac $file | grep "vfs cache: cleaned:" -m 1 | sed 's/.*vfs cache: cleaned: //' 2>/dev/null)
+                   # Extract currently uploading files from recent log lines (show up to 2 files)
+                   UPLOADING_FILES=$(tac $file | head -30 | grep -E "queuing for upload" | head -2 | sed 's/.*INFO : //' | sed 's/: vfs cache:.*//' | tr '\\n' ',' | sed 's/,$//' | sed 's/,/, /g' 2>/dev/null)
+                   # Build status message with available info
+                   if [ -n "$CACHE_STATUS" ] && [ -n "$UPLOADING_FILES" ]; then
+                     echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}] uploading: ${{UPLOADING_FILES}}"
+                   elif [ -n "$CACHE_STATUS" ]; then
+                     echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) [${{CACHE_STATUS}}]"
+                   else
+                     # Fallback: show last non-empty line from log
+                     LAST_LINE=$(tac $file | grep -v "^$" | head -1 | sed 's/.*INFO : //' | sed 's/.*ERROR : //' | sed 's/.*NOTICE: //' 2>/dev/null)
+                     if [ -n "$LAST_LINE" ]; then
+                       echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s) ${{LAST_LINE}}"
+                     else
+                       echo "skypilot: cached mount is still uploading (elapsed: ${{ELAPSED}}s)"
+                     fi
+                   fi
                    flushed=0
                    break
                  fi
                done
              done
-
+             TOTAL_FLUSH_TIME=$(($(date +%s) - FLUSH_START_TIME))
+             echo "skypilot: cached mount upload complete (took ${{TOTAL_FLUSH_TIME}}s)"
            fi""")
 
     def add_prologue(self, job_id: int) -> None:
@@ -214,6 +234,9 @@ class TaskCodeGen:
         self._code += [
             textwrap.dedent(f"""\
                 if sum(returncodes) != 0:
+                    # Save exit codes to job metadata for potential recovery logic
+                    if int(constants.SKYLET_VERSION) >= 28:
+                        job_lib.set_exit_codes({self.job_id!r}, returncodes)
                     job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                     # Schedule the next pending job immediately to make the job
                     # scheduling more efficient.
@@ -483,6 +506,8 @@ class RayCodeGen(TaskCodeGen):
                    msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
                    msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
                    print(msg, flush=True)
+                   if int(constants.SKYLET_VERSION) >= 28:
+                       job_lib.set_exit_codes({self.job_id!r}, setup_returncodes)
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                    # This waits for all streaming logs to finish.
                    time.sleep(1)
@@ -851,7 +876,18 @@ class SlurmCodeGen(TaskCodeGen):
        # $HOME/.local/bin/env (non-executable, from uv installation)
        # shadows /usr/bin/env.
        job_suffix = '-setup' if is_setup else ''
+       # Unset SLURM_* environment variables before running srun.
+       # When this srun runs inside another srun (from
+       # SlurmCommandRunner.run), inherited variables like
+       # SLURM_CPU_BIND, SLURM_NNODES, and SLURM_NODELIST constrain
+       # the inner srun to the parent step's allocation. This causes
+       # "CPU binding outside of job step allocation" errors.
+       # Unsetting all SLURM_* variables allows this srun to access the full job
+       # allocation. See:
+       # https://support.schedmd.com/show_bug.cgi?id=14298
+       # https://github.com/huggingface/datatrove/issues/248
        srun_cmd = (
+           "unset $(env | awk -F= '/^SLURM_/ {{print $1}}') && "
            f'srun --export=ALL --quiet --unbuffered --kill-on-bad-exit --jobid={self._slurm_job_id} '
            f'--job-name=sky-{self.job_id}{{job_suffix}} --ntasks-per-node=1 {{extra_flags}} '
            f'{{constants.SKY_SLURM_PYTHON_CMD}} -m sky.skylet.executor.slurm {{runner_args}}'
@@ -900,6 +936,8 @@ class SlurmCodeGen(TaskCodeGen):
                    msg += f' See error logs above for more details.{colorama.Style.RESET_ALL}'
                    print(msg, flush=True)
                    returncodes = [returncode]
+                   if int(constants.SKYLET_VERSION) >= 28:
+                       job_lib.set_exit_codes({self.job_id!r}, returncodes)
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                    sys.exit(1)
                time.sleep(0.1)

sky/catalog/data_fetchers/fetch_gcp.py
CHANGED

@@ -189,6 +189,9 @@ SERIES_TO_DESCRIPTION = {
     'c2': 'Compute optimized',
     'c2d': 'C2D AMD Instance',
     'c3': 'C3 Instance',
+    'c3d': 'C3D Instance',
+    'c4': 'C4 Instance',
+    'c4d': 'C4D Instance',
     'e2': 'E2 Instance',
     'f1': 'Micro Instance with burstable CPU',
     'g1': 'Small Instance with 1 VCPU',
@@ -376,8 +379,13 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
             is_cpu = True
         elif resource_group == 'RAM':
             is_memory = True
+        elif resource_group == 'LocalSSD':
+            # Ignore local SSD pricing for now, as we do not include disk
+            # pricing for instances for now.
+            # TODO(zhwu): Handle local SSD pricing.
+            pass
         else:
-            assert resource_group == 'N1Standard'
+            assert resource_group == 'N1Standard', (resource_group, sku)
             if 'Core' in description:
                 is_cpu = True
             elif 'Ram' in description:

sky/catalog/data_fetchers/fetch_nebius.py
CHANGED

@@ -180,7 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
         presets (List[PresetInfo]): A list of PresetInfo objects to write.
         output_file (str): The path to the output CSV file.
     """
-    os.makedirs(os.path.dirname(output_file))
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
     # Set up the CSV writer to output to stdout
     with open(output_file, 'w', encoding='utf-8') as out:
         header = [

sky/catalog/data_fetchers/fetch_vast.py
CHANGED

@@ -50,7 +50,7 @@ if __name__ == '__main__':
              ('num_gpus', 'AcceleratorCount'), ('cpu_cores', 'vCPUs'),
              ('cpu_ram', 'MemoryGiB'), ('gpu_name', 'GpuInfo'),
              ('search.totalHour', 'Price'), ('min_bid', 'SpotPrice'),
-             ('geolocation', 'Region'))
+             ('geolocation', 'Region'), ('hosting_type', 'HostingType'))
 
    # Vast has a wide variety of machines, some of
    # which will have less diskspace and network
@@ -138,7 +138,9 @@ if __name__ == '__main__':
 
    maxBid = max([x.get('SpotPrice') for x in toList])
    for instance in toList:
-
+       hosting_type = instance.get('HostingType', 0)
+       stub = (f'{instance["InstanceType"]} '
+               f'{instance["Region"][-2:]} {hosting_type}')
        if stub in seen:
            printstub = f'{stub}#print'
            if printstub not in seen:
sky/catalog/seeweb_catalog.py
CHANGED

@@ -7,22 +7,33 @@ query instance types and pricing information for Seeweb.
 import typing
 from typing import Dict, List, Optional, Tuple
 
-import
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 _PULL_FREQUENCY_HOURS = 8
-_df =
-
+_df = None
+
+
+def _get_df():
+    """Get the dataframe, loading it lazily if needed."""
+    global _df
+    if _df is None:
+        _df = common.read_catalog('seeweb/vms.csv',
+                                  pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+    return _df
 
 
 def instance_type_exists(instance_type: str) -> bool:
-    result = common.instance_type_exists_impl(
+    result = common.instance_type_exists_impl(_get_df(), instance_type)
     return result
 
 
@@ -33,7 +44,7 @@ def validate_region_zone(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.validate_region_zone_impl('Seeweb',
+    result = common.validate_region_zone_impl('Seeweb', _get_df(), region, zone)
     return result
 
 
@@ -46,14 +57,15 @@ def get_hourly_cost(instance_type: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.get_hourly_cost_impl(
-                                         zone)
+    result = common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
+                                         region, zone)
     return result
 
 
 def get_vcpus_mem_from_instance_type(
         instance_type: str) -> Tuple[Optional[float], Optional[float]]:
-    result = common.get_vcpus_mem_from_instance_type_impl(
+    result = common.get_vcpus_mem_from_instance_type_impl(
+        _get_df(), instance_type)
     return result
 
 
@@ -64,7 +76,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
                               region: Optional[str] = None,
                               zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
-    result = common.get_instance_type_for_cpus_mem_impl(
+    result = common.get_instance_type_for_cpus_mem_impl(_get_df(), cpus, memory,
                                                         region, zone)
     return result
 
@@ -72,7 +84,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
 def get_accelerators_from_instance_type(
         instance_type: str) -> Optional[Dict[str, int]]:
     # Filter the dataframe for the specific instance type
-
+    df = _get_df()
+    df_filtered = df[df['InstanceType'] == instance_type]
     if df_filtered.empty:
         return None
 
@@ -114,7 +127,7 @@ def get_instance_type_for_accelerator(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Seeweb does not support zones.')
 
-    result = common.get_instance_type_for_accelerator_impl(df=
+    result = common.get_instance_type_for_accelerator_impl(df=_get_df(),
                                                            acc_name=acc_name,
                                                            acc_count=acc_count,
                                                            cpus=cpus,
@@ -126,7 +139,7 @@ def get_instance_type_for_accelerator(
 
 
 def regions() -> List['cloud.Region']:
-    result = common.get_region_zones(
+    result = common.get_region_zones(_get_df(), use_spot=False)
     return result
 
 
@@ -135,7 +148,8 @@ def get_region_zones_for_instance_type(instance_type: str,
                                        ) -> List['cloud.Region']:
     """Returns a list of regions for a given instance type."""
     # Filter the dataframe for the specific instance type
-
+    df = _get_df()
+    df_filtered = df[df['InstanceType'] == instance_type]
     if df_filtered.empty:
         return []
 
@@ -174,7 +188,8 @@ def list_accelerators(
         require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
     """Lists accelerators offered in Seeweb."""
     # Filter out rows with empty or null regions (indicating unavailability)
-
+    df = _get_df()
+    df_filtered = df.dropna(subset=['Region'])
     df_filtered = df_filtered[df_filtered['Region'].str.strip() != '']
 
     result = common.list_accelerators_impl('Seeweb', df_filtered, gpus_only,
sky/catalog/shadeform_catalog.py
CHANGED

@@ -7,12 +7,15 @@ and can be used to query instance types and pricing information for Shadeform.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
-import
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 # We'll use dynamic fetching, so no static CSV file to load
 _df = None
|