skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -80,9 +80,8 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_STATUS_FETCH_MAX_RETRIES = 3
-_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
 _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60
 
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'

@@ -329,13 +328,21 @@ def ha_recovery_for_consolidation_mode() -> None:
 
 
 async def get_job_status(
-
-
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int]
+) -> Tuple[Optional['job_lib.JobStatus'], Optional[str]]:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
+
+    Returns:
+        job_status: The status of the job.
+        transient_error_reason: None if successful or fatal error; otherwise,
+            the detailed reason for the transient error.
     """
+    # TODO(zhwu, cooperc): Make this get job status aware of cluster status, so
+    # that it can exit retry early if the cluster is down.
     # TODO(luca) make this async
     handle = await context_utils.to_thread(
         global_user_state.get_handle_from_cluster_name, cluster_name)

@@ -343,85 +350,68 @@ async def get_job_status(
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
-        return None
+        return None, None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        logger.info('=== Checking the job status... ===')
+        statuses = await asyncio.wait_for(
+            context_utils.to_thread(backend.get_job_status,
+                                    handle,
+                                    job_ids=job_ids,
+                                    stream_logs=False),
+            timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+        status = list(statuses.values())[0]
+        if status is None:
+            logger.info('No job found.')
+        else:
+            logger.info(f'Job status: {status}')
+        logger.info('=' * 34)
+        return status, None
+    except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+            ValueError, TypeError, asyncio.TimeoutError) as e:
+        # Note: Each of these exceptions has some additional conditions to
+        # limit how we handle it and whether or not we catch it.
+        potential_transient_error_reason = None
+        if isinstance(e, exceptions.CommandError):
+            returncode = e.returncode
+            potential_transient_error_reason = (f'Returncode: {returncode}. '
+                                                f'{e.detailed_reason}')
+        elif isinstance(e, grpc.RpcError):
+            potential_transient_error_reason = e.details()
+        elif isinstance(e, grpc.FutureTimeoutError):
+            potential_transient_error_reason = 'grpc timeout'
+        elif isinstance(e, asyncio.TimeoutError):
+            potential_transient_error_reason = (
+                'Job status check timed out after '
+                f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+        # TODO(cooperc): Gracefully handle these exceptions in the backend.
+        elif isinstance(e, ValueError):
+            # If the cluster yaml is deleted in the middle of getting the
+            # SSH credentials, we could see this. See
+            # sky/global_user_state.py get_cluster_yaml_dict.
+            if re.search(r'Cluster yaml .* not found', str(e)):
+                potential_transient_error_reason = 'Cluster yaml was deleted'
             else:
-
-
-
-
-
-            #
-            #
-            #
-            #
-
-
-
-
-
-
-
-        elif isinstance(e, grpc.RpcError):
-            detailed_reason = e.details()
-            if e.code() in [
-                    grpc.StatusCode.UNAVAILABLE,
-                    grpc.StatusCode.DEADLINE_EXCEEDED
-            ]:
-                is_transient_error = True
-        elif isinstance(e, grpc.FutureTimeoutError):
-            detailed_reason = 'Timeout'
-        elif isinstance(e, asyncio.TimeoutError):
-            detailed_reason = ('Job status check timed out after '
-                               f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
-        # TODO(cooperc): Gracefully handle these exceptions in the backend.
-        elif isinstance(e, ValueError):
-            # If the cluster yaml is deleted in the middle of getting the
-            # SSH credentials, we could see this. See
-            # sky/global_user_state.py get_cluster_yaml_dict.
-            if re.search(r'Cluster yaml .* not found', str(e)):
-                detailed_reason = 'Cluster yaml was deleted'
-            else:
-                raise
-        elif isinstance(e, TypeError):
-            # We will grab the SSH credentials from the cluster yaml, but if
-            # handle.cluster_yaml is None, we will just return an empty dict
-            # for the credentials. See
-            # backend_utils.ssh_credential_from_yaml. Then, the credentials
-            # are passed as kwargs to SSHCommandRunner.__init__ - see
-            # cloud_vm_ray_backend.get_command_runners. So we can hit this
-            # TypeError if the cluster yaml is removed from the handle right
-            # when we pull it before the cluster is fully deleted.
-            error_msg_to_check = (
-                'SSHCommandRunner.__init__() missing 2 required positional '
-                'arguments: \'ssh_user\' and \'ssh_private_key\'')
-            if str(e) == error_msg_to_check:
-                detailed_reason = 'SSH credentials were already cleaned up'
-            else:
-                raise
-        if is_transient_error:
-            logger.info('Failed to connect to the cluster. Retrying '
-                        f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-            logger.info('=' * 34)
-            await asyncio.sleep(1)
+                raise
+        elif isinstance(e, TypeError):
+            # We will grab the SSH credentials from the cluster yaml, but if
+            # handle.cluster_yaml is None, we will just return an empty dict
+            # for the credentials. See
+            # backend_utils.ssh_credential_from_yaml. Then, the credentials
+            # are passed as kwargs to SSHCommandRunner.__init__ - see
+            # cloud_vm_ray_backend.get_command_runners. So we can hit this
+            # TypeError if the cluster yaml is removed from the handle right
+            # when we pull it before the cluster is fully deleted.
+            error_msg_to_check = (
+                'SSHCommandRunner.__init__() missing 2 required positional '
+                'arguments: \'ssh_user\' and \'ssh_private_key\'')
+            if str(e) == error_msg_to_check:
+                potential_transient_error_reason = ('SSH credentials were '
+                                                    'already cleaned up')
            else:
-
-
-                return None
-    return None
+                raise
+        return None, potential_transient_error_reason
 
 
 def controller_process_alive(record: managed_job_state.ControllerPidRecord,

@@ -1570,6 +1560,7 @@ def get_managed_job_queue(
                 handle.launched_resources.region,
                 handle.launched_resources.zone).formatted_str()
             job['accelerators'] = handle.launched_resources.accelerators
+            job['labels'] = handle.launched_resources.labels
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'

@@ -1578,6 +1569,7 @@ def get_managed_job_queue(
             job['region'] = '-'
             job['zone'] = '-'
             job['infra'] = '-'
+            job['labels'] = None
 
         if not fields or 'details' in fields:
             # Add details about schedule state / backoff.

@@ -1821,7 +1813,8 @@ def format_job_table(
         for replica in replica_info:
             used_by = replica.get('used_by')
             if used_by is not None:
-
+                for job_id in used_by:
+                    job_to_worker[job_id] = replica.get('replica_id')
         return job_to_worker
 
     # Create mapping from job_id to worker replica_id
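The change above turns get_job_status into a single-attempt probe that reports a transient error reason and leaves retry policy to the caller; the new JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS constant suggests a total time budget for such retries. Below is a minimal sketch of how a caller could retry on transient errors under that budget; the probe argument and the wait_for_job_status name are illustrative assumptions, not code from the package.

# Illustrative sketch only: a hypothetical caller that retries a status probe
# returning (status, transient_error_reason), giving up once a total budget
# elapses. The constant mirrors the diff; the loop itself is an assumption.
import asyncio
import time
from typing import Optional, Tuple

JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60


async def wait_for_job_status(probe) -> Tuple[Optional[str], Optional[str]]:
    """Retry `probe` while it reports transient errors, up to a total budget."""
    start = time.monotonic()
    while True:
        status, transient_error_reason = await probe()
        if transient_error_reason is None:
            # Either a definitive status or a definitive "no job found".
            return status, None
        if time.monotonic() - start > JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS:
            # Out of budget: surface the last transient reason to the caller.
            return None, transient_error_reason
        await asyncio.sleep(1)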
sky/models.py
CHANGED
@@ -68,6 +68,15 @@ class KubernetesNodeInfo:
     free: Dict[str, int]
     # IP address of the node (external IP preferred, fallback to internal IP)
     ip_address: Optional[str] = None
+    # CPU count (total CPUs available on the node)
+    cpu_count: Optional[float] = None
+    # Memory in GB (total memory available on the node)
+    memory_gb: Optional[float] = None
+    # Free CPU count (free CPUs available on the node after pod allocations)
+    cpu_free: Optional[float] = None
+    # Free memory in GB (free memory available on the node after pod
+    # allocations)
+    memory_free_gb: Optional[float] = None
     # Whether the node is ready (all conditions are satisfied)
     is_ready: bool = True
 
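The new KubernetesNodeInfo fields are all optional, so consumers have to treat None as "unknown" rather than zero. The sketch below uses a trimmed stand-in dataclass (NodeCapacity is a hypothetical name, not the package class) to show one way a consumer might derive CPU utilization from the capacity/free pair.

# Sketch: a trimmed stand-in for the new capacity/free fields, showing how a
# consumer might derive utilization while treating None as "unknown".
from dataclasses import dataclass
from typing import Optional


@dataclass
class NodeCapacity:
    cpu_count: Optional[float] = None
    memory_gb: Optional[float] = None
    cpu_free: Optional[float] = None
    memory_free_gb: Optional[float] = None


def cpu_utilization(node: NodeCapacity) -> Optional[float]:
    """Fraction of node CPUs requested by pods, or None if unknown."""
    if node.cpu_count is None or node.cpu_free is None or node.cpu_count == 0:
        return None
    return (node.cpu_count - node.cpu_free) / node.cpu_count


print(cpu_utilization(NodeCapacity(cpu_count=8.0, cpu_free=5.5)))  # 0.3125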
sky/optimizer.py
CHANGED
@@ -20,6 +20,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import log_utils
 from sky.utils import registry

@@ -1290,7 +1291,7 @@ def _check_specified_regions(task: task_lib.Task) -> None:
                 msg = f'Task{task_name} requires '
             if region not in existing_contexts:
                 if is_ssh:
-                    infra_str = f'SSH/{
+                    infra_str = f'SSH/{common_utils.removeprefix(region, "ssh-")}'
                 else:
                     infra_str = f'Kubernetes/{region}'
                 logger.warning(f'{infra_str} is not enabled.')
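The optimizer fix builds the display string by stripping the ssh- prefix from the context name via common_utils.removeprefix. As a rough illustration, assuming that helper behaves like str.removeprefix (Python 3.9+), the transformation looks like this; the remove_prefix function here is a stand-in, not the package's implementation.

# Sketch: prefix stripping used to turn a context like 'ssh-my-pool' into the
# display string 'SSH/my-pool'. The helper mirrors str.removeprefix; the exact
# implementation inside common_utils is an assumption here.
def remove_prefix(text: str, prefix: str) -> str:
    return text[len(prefix):] if text.startswith(prefix) else text


region = 'ssh-my-pool'
infra_str = f'SSH/{remove_prefix(region, "ssh-")}'
print(infra_str)  # SSH/my-pool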
sky/provision/__init__.py
CHANGED
@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
 import functools
 import inspect
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
 from sky import models
 from sky import sky_logging

@@ -152,16 +152,18 @@ def get_volume_usedby(
 @_route_to_cloud_impl
 def get_all_volumes_usedby(
     provider_name: str, configs: List[models.VolumeConfig]
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Get the usedby of
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Get the usedby of all volumes.
+
+    Args:
+        provider_name: Name of the provider.
+        configs: List of VolumeConfig objects.
 
     Returns:
-        usedby_pods:
-
-
-
-            for a volume and a key containing clusters using
-            the volume.
+        usedby_pods: Dict of usedby pods.
+        usedby_clusters: Dict of usedby clusters.
+        failed_volume_names: Set of volume names whose usedby info
+            failed to fetch.
     """
     raise NotImplementedError
 
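With the widened interface, get_all_volumes_usedby implementations return a third element listing the volumes whose usedby information could not be fetched. A small sketch of consuming that shape, using a fake fetcher rather than a real provider call:

# Illustrative sketch only: a stand-in fetcher with the same
# (usedby_pods, usedby_clusters, failed_volume_names) shape that
# get_all_volumes_usedby now returns, and a caller that treats failed
# volumes as "unknown" rather than "unused".
from typing import Any, Dict, Set, Tuple


def fake_fetch() -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    return ({'vol-a': ['pod-1']}, {'vol-a': ['cluster-1']}, {'vol-b'})


usedby_pods, usedby_clusters, failed_volume_names = fake_fetch()
for name in sorted(failed_volume_names):
    # Usage for these volumes could not be determined (e.g. the pod listing
    # for their namespace failed), so do not report them as unused.
    print(f'{name}: usage unknown')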
sky/provision/kubernetes/utils.py
CHANGED

@@ -144,6 +144,7 @@ DEFAULT_NAMESPACE = 'default'
 DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'
 
 MEMORY_SIZE_UNITS = {
+    'm': 0.001,
     'B': 1,
     'K': 2**10,
     'M': 2**20,

@@ -1331,12 +1332,20 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def
+def get_allocated_resources_by_node(
     *,
     context: Optional[str] = None,
-) -> Dict[str, int]:
-    """Gets allocated GPU
+) -> Tuple[Dict[str, int], Dict[str, Tuple[float, float]]]:
+    """Gets allocated GPU, CPU, and memory by each node by fetching pods in
     all namespaces in kubernetes cluster indicated by context.
+
+    This function combines GPU and CPU/memory allocation tracking into a single
+    API call for better performance.
+
+    Returns:
+        Tuple of (allocated_gpu_qty_by_node, allocated_cpu_memory_by_node):
+        - allocated_gpu_qty_by_node: Dict mapping node name to allocated GPU count
+        - allocated_cpu_memory_by_node: Dict mapping node name to (allocated_cpu, allocated_memory_gb) tuple
     """
     if context is None:
         context = get_current_kube_config_context_name()

@@ -1355,29 +1364,67 @@ def get_allocated_gpu_qty_by_node(
         field_selector=field_selector)
     try:
         allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        allocated_cpu_memory_by_node: Dict[str, Tuple[
+            float, float]] = collections.defaultdict(lambda: (0.0, 0.0))
         for item_dict in ijson.items(response,
                                      'items.item',
                                      buf_size=IJSON_BUFFER_SIZE):
             pod = V1Pod.from_dict(item_dict)
             if should_exclude_pod_from_gpu_allocation(pod):
                 logger.debug(
-                    f'Excluding pod {pod.metadata.name} from
+                    f'Excluding pod {pod.metadata.name} from resource count '
                     f'calculations on node {pod.spec.node_name}')
                 continue
-
-
+            if not pod.spec.node_name:
+                continue
+
+            # Iterate over all the containers in the pod and sum the resources
             pod_allocated_qty = 0
+            pod_allocated_cpu = 0.0
+            pod_allocated_memory_gb = 0.0
             for container in pod.spec.containers:
                 if container.resources.requests:
+                    requests = container.resources.requests
+                    # Parse GPU
                     pod_allocated_qty += get_node_accelerator_count(
-                        context,
-
+                        context, requests)
+                    # Parse CPU
+                    if 'cpu' in requests:
+                        pod_allocated_cpu += parse_cpu_or_gpu_resource_to_float(
+                            requests['cpu'])
+                    # Parse memory
+                    if 'memory' in requests:
+                        pod_allocated_memory_gb += parse_memory_resource(
+                            requests['memory'], unit='G')
+
+            if pod_allocated_qty > 0:
                 allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
-
+            if pod_allocated_cpu > 0 or pod_allocated_memory_gb > 0:
+                current_cpu, current_memory = allocated_cpu_memory_by_node[
+                    pod.spec.node_name]
+                allocated_cpu_memory_by_node[pod.spec.node_name] = (
+                    current_cpu + pod_allocated_cpu,
+                    current_memory + pod_allocated_memory_gb)
+        return allocated_qty_by_node, allocated_cpu_memory_by_node
     finally:
         response.release_conn()
 
 
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
+
+    Note: For better performance when you also need CPU/memory allocation,
+    use get_allocated_resources_by_node() instead.
+    """
+    allocated_qty_by_node, _ = get_allocated_resources_by_node(context=context)
+    return allocated_qty_by_node
+
+
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.

@@ -2189,6 +2236,13 @@ def get_current_kube_config_context_name() -> Optional[str]:
         _, current_context = kubernetes.list_kube_config_contexts()
         return current_context['name']
     except k8s.config.config_exception.ConfigException:
+        # If kubeconfig is not available, check if running in-cluster and
+        # return the in-cluster context name. This is needed when kubeconfig
+        # is not uploaded to the pod (e.g., remote_identity: SERVICE_ACCOUNT)
+        # but we still need to know the context name for operations like
+        # port mode detection.
+        if is_incluster_config_available():
+            return kubernetes.in_cluster_context_name()
         return None
 
 

@@ -2313,7 +2367,7 @@ def parse_memory_resource(resource_qty_str: str,
     try:
         bytes_value = int(resource_str)
     except ValueError:
-        memory_size = re.sub(r'([
+        memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
         number, unit_index = [item.strip() for item in memory_size.split()]
         unit_index = unit_index[0]
         bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]

@@ -3061,16 +3115,32 @@ def get_kubernetes_node_info(
             has_accelerator_nodes = True
             break
 
-    # Get the allocated GPU
+    # Get the allocated resources (GPU, CPU, memory) by each node in a single call
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-
+    allocated_cpu_memory_by_node: Dict[str, Tuple[float, float]] = {}
+    error_on_get_allocated_resources = False
+    # Get resource allocation. For GPU allocation, only call if there are GPU nodes
+    # (same as master branch). For CPU/memory, we always need it for all nodes.
     if has_accelerator_nodes:
+        # When there are GPU nodes, get both GPU and CPU/memory in one call
        try:
-            allocated_qty_by_node =
+            allocated_qty_by_node, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
                 context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
-
+                error_on_get_allocated_resources = True
+                pass
+            else:
+                raise
+    else:
+        # When there are no GPU nodes, we still need CPU/memory allocation
+        # This is an extra API call compared to master branch
+        try:
+            _, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_resources = True
                 pass
             else:
                 raise

@@ -3106,6 +3176,35 @@ def get_kubernetes_node_info(
 
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+
+        # Parse CPU and memory from node capacity
+        cpu_count = None
+        memory_gb = None
+        try:
+            if 'cpu' in node.status.capacity:
+                cpu_count = float(
+                    parse_cpu_or_gpu_resource(node.status.capacity['cpu']))
+            if 'memory' in node.status.capacity:
+                memory_gb = parse_memory_resource(
+                    node.status.capacity['memory'], unit='G')
+        except (KeyError, ValueError) as e:
+            # If parsing fails, log but continue
+            logger.debug(f'Failed to parse CPU/memory for node '
+                         f'{node.metadata.name}: {e}')
+
+        # Calculate free CPU and memory
+        cpu_free = None
+        memory_free_gb = None
+        if cpu_count is not None or memory_gb is not None:
+            if not error_on_get_allocated_resources:
+                allocated_cpu, allocated_memory = allocated_cpu_memory_by_node.get(
+                    node.metadata.name, (0.0, 0.0))
+                if cpu_count is not None:
+                    cpu_free = max(0.0, cpu_count - allocated_cpu)
+                if memory_gb is not None:
+                    memory_free_gb = max(0.0, memory_gb - allocated_memory)
+            # If we can't get allocation info, set free to None (unknown)
+
         # Check if node is ready
         node_is_ready = node.is_ready()
 

@@ -3116,13 +3215,17 @@ def get_kubernetes_node_info(
                 total={'accelerator_count': 0},
                 free={'accelerators_available': 0},
                 ip_address=node_ip,
+                cpu_count=cpu_count,
+                memory_gb=memory_gb,
+                cpu_free=cpu_free,
+                memory_free_gb=memory_free_gb,
                 is_ready=node_is_ready)
             continue
 
         if not node_is_ready:
             # If node is not ready, report 0 available GPUs
             accelerators_available = 0
-        elif not has_accelerator_nodes or
+        elif not has_accelerator_nodes or error_on_get_allocated_resources:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]

@@ -3141,6 +3244,10 @@ def get_kubernetes_node_info(
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
             ip_address=node_ip,
+            cpu_count=cpu_count,
+            memory_gb=memory_gb,
+            cpu_free=cpu_free,
+            memory_free_gb=memory_free_gb,
             is_ready=node_is_ready)
         hint = ''
         if has_multi_host_tpu:
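The 'm' entry added to MEMORY_SIZE_UNITS lets parse_memory_resource handle Kubernetes quantities expressed with the milli suffix (e.g. memory reported as 128974848m), and the widened regex splits the number from the suffix before the unit lookup. A simplified, self-contained re-implementation of the same approach is sketched below; it is an illustration, not the package function.

# Simplified sketch of suffix-based memory parsing with a milli ('m') unit,
# in the spirit of the change above; not the package's parse_memory_resource.
import re

UNITS = {'m': 0.001, 'B': 1, 'K': 2**10, 'M': 2**20,
         'G': 2**30, 'T': 2**40, 'P': 2**50}


def memory_to_gb(qty: str) -> float:
    """Convert a Kubernetes memory quantity string to GB (2**30 bytes)."""
    try:
        bytes_value = float(int(qty))
    except ValueError:
        # Split '512Mi' / '128974848m' into number and unit, then key the
        # lookup by the first unit character ('Mi' -> 'M', 'm' -> 'm').
        spaced = re.sub(r'([KMGTPBm]+)', r' \1', qty)
        number, unit = [item.strip() for item in spaced.split()]
        bytes_value = float(number) * UNITS[unit[0]]
    return bytes_value / 2**30


print(memory_to_gb('16Gi'))   # 16.0
print(memory_to_gb('512Mi'))  # 0.5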
sky/provision/kubernetes/volume.py
CHANGED

@@ -45,7 +45,9 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
             continue
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name,
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue

@@ -65,7 +67,8 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     if storage_class_name is not None:
         try:
             kubernetes.storage_api(context).read_storage_class(
-                name=storage_class_name
+                name=storage_class_name,
+                _request_timeout=kubernetes.API_TIMEOUT)
         except kubernetes.api_exception() as e:
             raise config_lib.KubernetesError(
                 f'Check storage class {storage_class_name} error: {e}')

@@ -82,7 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             context).delete_namespaced_persistent_volume_claim(
                 name=pvc_name,
                 namespace=namespace,
-                _request_timeout=
+                _request_timeout=kubernetes.API_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
     logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')

@@ -119,7 +122,9 @@ def _get_volume_usedby(
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     pods = kubernetes.core_api(context).list_namespaced_pod(
-        namespace=namespace,
+        namespace=namespace,
+        field_selector=field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT)
     for pod in pods.items:
         if pod.spec.volumes is None:
             continue

@@ -164,8 +169,21 @@ def get_volume_usedby(
 
 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes.
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of context to namespace to volume name to pods
+            using the volume. These may include pods not created by
+            SkyPilot.
+        usedby_clusters: Dictionary of context to namespace to volume name to
+            clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     field_selector = ','.join([
         f'status.phase!={phase}'
         for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES

@@ -173,26 +191,39 @@ def get_all_volumes_usedby(
     label_selector = 'parent=skypilot'
     context_to_namespaces: Dict[str, Set[str]] = {}
     pvc_names = set()
+    original_volume_names: Dict[str, Dict[str, List[str]]] = {}
     for config in configs:
         context, namespace = _get_context_namespace(config)
-
-
-
+        context_to_namespaces.setdefault(context, set()).add(namespace)
+        original_volume_names.setdefault(context,
+                                         {}).setdefault(namespace,
+                                                        []).append(config.name)
         pvc_names.add(config.name_on_cloud)
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
     used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    failed_volume_names: Set[str] = set()
     for context, namespaces in context_to_namespaces.items():
         used_by_pods[context] = {}
         used_by_clusters[context] = {}
         for namespace in namespaces:
             used_by_pods[context][namespace] = {}
             used_by_clusters[context][namespace] = {}
-
-
-
-
+            try:
+                pods = kubernetes.core_api(context).list_namespaced_pod(
+                    namespace=namespace,
+                    field_selector=field_selector,
+                    label_selector=label_selector,
+                    _request_timeout=kubernetes.API_TIMEOUT)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get pods in namespace {namespace} '
+                             f'in context {context}: {e}')
+                # Mark all volumes in this namespace as failed
+                for original_volume_name in original_volume_names[context][
+                        namespace]:
+                    failed_volume_names.add(original_volume_name)
+                continue
             for pod in pods.items:
                 if pod.spec.volumes is None:
                     continue

@@ -217,7 +248,7 @@ def get_all_volumes_usedby(
                 used_by_clusters[context][namespace][cluster_name] = []
                 used_by_clusters[context][namespace][cluster_name].append(
                     cluster_name)
-    return used_by_pods, used_by_clusters
+    return used_by_pods, used_by_clusters, failed_volume_names
 
 
 def map_all_volumes_usedby(

@@ -292,7 +323,9 @@ def create_persistent_volume_claim(
     try:
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name,
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         if config is not None:
             _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')

@@ -305,8 +338,10 @@ def create_persistent_volume_claim(
         raise ValueError(
             f'PVC {pvc_name} does not exist while use_existing is True.')
     pvc = kubernetes.core_api(
-        context).create_namespaced_persistent_volume_claim(
-
+        context).create_namespaced_persistent_volume_claim(
+            namespace=namespace,
+            body=pvc_spec,
+            _request_timeout=kubernetes.API_TIMEOUT)
     logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
     if config is not None:
         _populate_config_from_pvc(config, pvc)
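The volume scanning change wraps each per-namespace pod listing in a try/except so that one failing context or namespace only marks its own volumes as failed instead of aborting the whole scan. The sketch below shows the same aggregate-with-failures pattern on generic stand-in data; none of the names are SkyPilot APIs, and the per-volume pod matching is deliberately simplified.

# Sketch of the pattern introduced above: scan many namespaces, tolerate
# per-namespace fetch failures, and report the volumes whose usage could not
# be determined. The fetch callable and data shapes are stand-ins.
from typing import Callable, Dict, List, Set, Tuple


def scan_usage(
    namespaces_to_volumes: Dict[str, List[str]],
    list_pods: Callable[[str], List[str]],
) -> Tuple[Dict[str, List[str]], Set[str]]:
    used_by_pods: Dict[str, List[str]] = {}
    failed_volumes: Set[str] = set()
    for namespace, volumes in namespaces_to_volumes.items():
        try:
            pods = list_pods(namespace)
        except Exception as e:  # broad catch, mirroring the diff above
            # One bad namespace should not abort the scan; mark its volumes
            # as "unknown" instead.
            print(f'Failed to list pods in {namespace}: {e}')
            failed_volumes.update(volumes)
            continue
        for volume in volumes:
            # Real code would match pods to the volume; elided here.
            used_by_pods[volume] = pods
    return used_by_pods, failed_volumes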
sky/provision/provisioner.py
CHANGED
|
@@ -493,7 +493,8 @@ def _post_provision_setup(
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
     is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
-
+    is_slurm_cloud = cloud_name.lower() == 'slurm'
+    if not is_k8s_cloud and not is_slurm_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)