skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0

@@ -144,6 +144,7 @@ DEFAULT_NAMESPACE = 'default'
 DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'

 MEMORY_SIZE_UNITS = {
+    'm': 0.001,
     'B': 1,
     'K': 2**10,
     'M': 2**20,
@@ -1205,15 +1206,24 @@ class V1NodeAddress:
     address: str


+@dataclasses.dataclass
+class V1NodeCondition:
+    """Represents a Kubernetes node condition."""
+    type: str
+    status: str
+
+
 @dataclasses.dataclass
 class V1NodeStatus:
     allocatable: Dict[str, str]
     capacity: Dict[str, str]
     addresses: List[V1NodeAddress]
+    conditions: List[V1NodeCondition]


 @dataclasses.dataclass
 class V1Node:
+    """Represents a Kubernetes node."""
     metadata: V1ObjectMeta
     status: V1NodeStatus

@@ -1231,8 +1241,24 @@ class V1Node:
                 V1NodeAddress(type=addr['type'],
                               address=addr['address'])
                 for addr in data['status'].get('addresses', [])
+            ],
+            conditions=[
+                V1NodeCondition(type=cond['type'],
+                                status=cond['status'])
+                for cond in data['status'].get('conditions', [])
             ]))

+    def is_ready(self) -> bool:
+        """Check if the node is ready based on its conditions.
+
+        A node is considered ready if it has a 'Ready' condition with
+        status 'True'.
+        """
+        for condition in self.status.conditions:
+            if condition.type == 'Ready':
+                return condition.status == 'True'
+        return False
+

 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
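
The readiness helper added above follows the usual Kubernetes convention: a node counts as ready only when its 'Ready' condition reports status 'True'. As a standalone sketch, the same check applied to a raw node dict from the Kubernetes API (rather than SkyPilot's V1Node wrapper) looks like this; the example node is invented:

def node_is_ready(node_dict: dict) -> bool:
    # Scan status.conditions for the 'Ready' entry; a missing entry means not ready.
    for cond in node_dict.get('status', {}).get('conditions', []):
        if cond.get('type') == 'Ready':
            return cond.get('status') == 'True'
    return False

example_node = {
    'status': {
        'conditions': [
            {'type': 'MemoryPressure', 'status': 'False'},
            {'type': 'Ready', 'status': 'True'},
        ]
    }
}
assert node_is_ready(example_node)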
@@ -1306,12 +1332,20 @@ class V1Pod:


 @_retry_on_error(resource_type='pod')
-def get_allocated_gpu_qty_by_node(
+def get_allocated_resources_by_node(
     *,
     context: Optional[str] = None,
-) -> Dict[str, int]:
-    """Gets allocated GPU quantity by each node by fetching pods in
+) -> Tuple[Dict[str, int], Dict[str, Tuple[float, float]]]:
+    """Gets allocated GPU, CPU, and memory by each node by fetching pods in
     all namespaces in kubernetes cluster indicated by context.
+
+    This function combines GPU and CPU/memory allocation tracking into a single
+    API call for better performance.
+
+    Returns:
+        Tuple of (allocated_gpu_qty_by_node, allocated_cpu_memory_by_node):
+        - allocated_gpu_qty_by_node: Dict mapping node name to allocated GPU count
+        - allocated_cpu_memory_by_node: Dict mapping node name to (allocated_cpu, allocated_memory_gb) tuple
     """
     if context is None:
         context = get_current_kube_config_context_name()
@@ -1330,29 +1364,67 @@ def get_allocated_gpu_qty_by_node(
         field_selector=field_selector)
     try:
         allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        allocated_cpu_memory_by_node: Dict[str, Tuple[
+            float, float]] = collections.defaultdict(lambda: (0.0, 0.0))
         for item_dict in ijson.items(response,
                                      'items.item',
                                      buf_size=IJSON_BUFFER_SIZE):
             pod = V1Pod.from_dict(item_dict)
             if should_exclude_pod_from_gpu_allocation(pod):
                 logger.debug(
-                    f'Excluding pod {pod.metadata.name} from
+                    f'Excluding pod {pod.metadata.name} from resource count '
                     f'calculations on node {pod.spec.node_name}')
                 continue
-
-
+            if not pod.spec.node_name:
+                continue
+
+            # Iterate over all the containers in the pod and sum the resources
             pod_allocated_qty = 0
+            pod_allocated_cpu = 0.0
+            pod_allocated_memory_gb = 0.0
             for container in pod.spec.containers:
                 if container.resources.requests:
+                    requests = container.resources.requests
+                    # Parse GPU
                     pod_allocated_qty += get_node_accelerator_count(
-                        context,
-                        container.resources.requests)
+                        context, requests)
+                    # Parse CPU
+                    if 'cpu' in requests:
+                        pod_allocated_cpu += parse_cpu_or_gpu_resource_to_float(
+                            requests['cpu'])
+                    # Parse memory
+                    if 'memory' in requests:
+                        pod_allocated_memory_gb += parse_memory_resource(
+                            requests['memory'], unit='G')
+
+            if pod_allocated_qty > 0:
                 allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
-        return allocated_qty_by_node
+            if pod_allocated_cpu > 0 or pod_allocated_memory_gb > 0:
+                current_cpu, current_memory = allocated_cpu_memory_by_node[
+                    pod.spec.node_name]
+                allocated_cpu_memory_by_node[pod.spec.node_name] = (
+                    current_cpu + pod_allocated_cpu,
+                    current_memory + pod_allocated_memory_gb)
+        return allocated_qty_by_node, allocated_cpu_memory_by_node
     finally:
         response.release_conn()


+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
+
+    Note: For better performance when you also need CPU/memory allocation,
+    use get_allocated_resources_by_node() instead.
+    """
+    allocated_qty_by_node, _ = get_allocated_resources_by_node(context=context)
+    return allocated_qty_by_node
+
+
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.
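
A hypothetical caller of the combined helper can derive per-node free capacity the way the node-info code later in this file does; the node names and capacities below are invented for illustration, and only the max(0, capacity - allocated) pattern is taken from the diff:

# Shapes mirror the new return value: GPU counts per node, and
# (allocated CPUs, allocated memory in GB) per node.
gpu_by_node = {'node-a': 2}
cpu_mem_by_node = {'node-a': (3.5, 12.0)}

capacity = {'node-a': {'cpu': 8.0, 'memory_gb': 32.0, 'gpu': 4}}  # made up
for node, caps in capacity.items():
    alloc_cpu, alloc_mem = cpu_mem_by_node.get(node, (0.0, 0.0))
    free_cpu = max(0.0, caps['cpu'] - alloc_cpu)
    free_mem = max(0.0, caps['memory_gb'] - alloc_mem)
    free_gpu = caps['gpu'] - gpu_by_node.get(node, 0)
    print(f'{node}: {free_cpu} CPUs, {free_mem} GB, {free_gpu} GPUs free')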
@@ -1451,11 +1523,12 @@ def check_instance_fits(context: Optional[str],
             return False, str(e)
     # Get the set of nodes that have the GPU type
     gpu_nodes = [
-        node for node in nodes if gpu_label_key in node.metadata.labels and
+        node for node in nodes
+        if node.is_ready() and gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] in gpu_label_values
     ]
     if not gpu_nodes:
-        return False, f'No GPU nodes found with {acc_type} on the cluster'
+        return False, f'No ready GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -1479,7 +1552,9 @@ def check_instance_fits(context: Optional[str],
                             f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
                             f'memory (> {k8s_instance_type.memory} G). ')
     else:
-        candidate_nodes = nodes
+        candidate_nodes = [node for node in nodes if node.is_ready()]
+        if not candidate_nodes:
+            return False, 'No ready nodes found in the cluster.'
         not_fit_reason_prefix = (f'No nodes found with enough '
                                  f'CPU (> {k8s_instance_type.cpus} CPUs) '
                                  'and/or memory '
@@ -2161,6 +2236,13 @@ def get_current_kube_config_context_name() -> Optional[str]:
         _, current_context = kubernetes.list_kube_config_contexts()
         return current_context['name']
     except k8s.config.config_exception.ConfigException:
+        # If kubeconfig is not available, check if running in-cluster and
+        # return the in-cluster context name. This is needed when kubeconfig
+        # is not uploaded to the pod (e.g., remote_identity: SERVICE_ACCOUNT)
+        # but we still need to know the context name for operations like
+        # port mode detection.
+        if is_incluster_config_available():
+            return kubernetes.in_cluster_context_name()
         return None

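
For reference, the same kubeconfig-first, in-cluster-fallback resolution order can be sketched directly against the official kubernetes Python client; the in-cluster label below is a placeholder, since the real context name comes from SkyPilot's kubernetes adaptor:

from kubernetes import config

def current_context_name(incluster_label: str = 'in-cluster'):
    # Prefer the active kubeconfig context; fall back to the mounted
    # service-account credentials if no kubeconfig is available.
    try:
        _, active = config.list_kube_config_contexts()
        return active['name']
    except config.config_exception.ConfigException:
        try:
            config.load_incluster_config()
            return incluster_label  # placeholder for the real in-cluster name
        except config.config_exception.ConfigException:
            return None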
@@ -2285,7 +2367,7 @@ def parse_memory_resource(resource_qty_str: str,
     try:
         bytes_value = int(resource_str)
     except ValueError:
-        memory_size = re.sub(r'([KMGTPB]+)', r' \1', resource_str)
+        memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
         number, unit_index = [item.strip() for item in memory_size.split()]
         unit_index = unit_index[0]
         bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]
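
As a worked example of the parsing path patched above, assuming the excerpted unit table ('m' denotes milli-quantities as sometimes emitted by the Kubernetes API):

import re

MEMORY_SIZE_UNITS = {'m': 0.001, 'B': 1, 'K': 2**10, 'M': 2**20}  # excerpt

def to_bytes(qty: str) -> float:
    # Same split-then-lookup steps as the patched function.
    spaced = re.sub(r'([KMGTPBm]+)', r' \1', qty)
    number, unit = spaced.split()
    return float(number) * MEMORY_SIZE_UNITS[unit[0]]

print(to_bytes('4000m'))  # 4.0 bytes (milli-byte quantity)
print(to_bytes('512M'))   # 536870912.0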
@@ -3033,16 +3115,32 @@ def get_kubernetes_node_info(
             has_accelerator_nodes = True
             break

-    # Get the allocated GPU
+    # Get the allocated resources (GPU, CPU, memory) by each node in a single call
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-
+    allocated_cpu_memory_by_node: Dict[str, Tuple[float, float]] = {}
+    error_on_get_allocated_resources = False
+    # Get resource allocation. For GPU allocation, only call if there are GPU nodes
+    # (same as master branch). For CPU/memory, we always need it for all nodes.
     if has_accelerator_nodes:
+        # When there are GPU nodes, get both GPU and CPU/memory in one call
         try:
-            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+            allocated_qty_by_node, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
                 context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
-
+                error_on_get_allocated_resources = True
+                pass
+            else:
+                raise
+    else:
+        # When there are no GPU nodes, we still need CPU/memory allocation
+        # This is an extra API call compared to master branch
+        try:
+            _, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_resources = True
                 pass
             else:
                 raise
@@ -3078,16 +3176,56 @@ def get_kubernetes_node_info(

         accelerator_count = get_node_accelerator_count(context,
                                                         node.status.allocatable)
+
+        # Parse CPU and memory from node capacity
+        cpu_count = None
+        memory_gb = None
+        try:
+            if 'cpu' in node.status.capacity:
+                cpu_count = float(
+                    parse_cpu_or_gpu_resource(node.status.capacity['cpu']))
+            if 'memory' in node.status.capacity:
+                memory_gb = parse_memory_resource(
+                    node.status.capacity['memory'], unit='G')
+        except (KeyError, ValueError) as e:
+            # If parsing fails, log but continue
+            logger.debug(f'Failed to parse CPU/memory for node '
+                         f'{node.metadata.name}: {e}')
+
+        # Calculate free CPU and memory
+        cpu_free = None
+        memory_free_gb = None
+        if cpu_count is not None or memory_gb is not None:
+            if not error_on_get_allocated_resources:
+                allocated_cpu, allocated_memory = allocated_cpu_memory_by_node.get(
+                    node.metadata.name, (0.0, 0.0))
+                if cpu_count is not None:
+                    cpu_free = max(0.0, cpu_count - allocated_cpu)
+                if memory_gb is not None:
+                    memory_free_gb = max(0.0, memory_gb - allocated_memory)
+            # If we can't get allocation info, set free to None (unknown)
+
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
         if accelerator_count == 0:
             node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
                 name=node.metadata.name,
                 accelerator_type=accelerator_name,
                 total={'accelerator_count': 0},
                 free={'accelerators_available': 0},
-                ip_address=node_ip
+                ip_address=node_ip,
+                cpu_count=cpu_count,
+                memory_gb=memory_gb,
+                cpu_free=cpu_free,
+                memory_free_gb=memory_free_gb,
+                is_ready=node_is_ready)
             continue

-        if not
+        if not node_is_ready:
+            # If node is not ready, report 0 available GPUs
+            accelerators_available = 0
+        elif not has_accelerator_nodes or error_on_get_allocated_resources:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]
@@ -3105,7 +3243,12 @@ def get_kubernetes_node_info(
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
-            ip_address=node_ip
+            ip_address=node_ip,
+            cpu_count=cpu_count,
+            memory_gb=memory_gb,
+            cpu_free=cpu_free,
+            memory_free_gb=memory_free_gb,
+            is_ready=node_is_ready)
         hint = ''
         if has_multi_host_tpu:
             hint = ('(Note: Multi-host TPUs are detected and excluded from the '
sky/provision/kubernetes/volume.py
CHANGED
|
@@ -45,7 +45,9 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
             continue
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue
@@ -65,7 +67,8 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     if storage_class_name is not None:
         try:
             kubernetes.storage_api(context).read_storage_class(
-                name=storage_class_name)
+                name=storage_class_name,
+                _request_timeout=kubernetes.API_TIMEOUT)
         except kubernetes.api_exception() as e:
             raise config_lib.KubernetesError(
                 f'Check storage class {storage_class_name} error: {e}')
@@ -82,7 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             context).delete_namespaced_persistent_volume_claim(
                 name=pvc_name,
                 namespace=namespace,
-                _request_timeout=
+                _request_timeout=kubernetes.API_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
     logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
@@ -119,7 +122,9 @@ def _get_volume_usedby(
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     pods = kubernetes.core_api(context).list_namespaced_pod(
-        namespace=namespace, field_selector=field_selector)
+        namespace=namespace,
+        field_selector=field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT)
     for pod in pods.items:
         if pod.spec.volumes is None:
             continue
@@ -164,8 +169,21 @@ def get_volume_usedby(

 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of context to namespace to volume name to pods
+            using the volume. These may include pods not created by
+            SkyPilot.
+        usedby_clusters: Dictionary of context to namespace to volume name to
+            clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     field_selector = ','.join([
         f'status.phase!={phase}'
         for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
@@ -173,26 +191,39 @@ def get_all_volumes_usedby(
     label_selector = 'parent=skypilot'
     context_to_namespaces: Dict[str, Set[str]] = {}
     pvc_names = set()
+    original_volume_names: Dict[str, Dict[str, List[str]]] = {}
     for config in configs:
         context, namespace = _get_context_namespace(config)
-
-
-
+        context_to_namespaces.setdefault(context, set()).add(namespace)
+        original_volume_names.setdefault(context,
+                                         {}).setdefault(namespace,
+                                                        []).append(config.name)
         pvc_names.add(config.name_on_cloud)
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
     used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    failed_volume_names: Set[str] = set()
     for context, namespaces in context_to_namespaces.items():
         used_by_pods[context] = {}
         used_by_clusters[context] = {}
         for namespace in namespaces:
             used_by_pods[context][namespace] = {}
             used_by_clusters[context][namespace] = {}
-
-
-
-
+            try:
+                pods = kubernetes.core_api(context).list_namespaced_pod(
+                    namespace=namespace,
+                    field_selector=field_selector,
+                    label_selector=label_selector,
+                    _request_timeout=kubernetes.API_TIMEOUT)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get pods in namespace {namespace} '
+                             f'in context {context}: {e}')
+                # Mark all volumes in this namespace as failed
+                for original_volume_name in original_volume_names[context][
+                        namespace]:
+                    failed_volume_names.add(original_volume_name)
+                continue
             for pod in pods.items:
                 if pod.spec.volumes is None:
                     continue
@@ -217,7 +248,7 @@ def get_all_volumes_usedby(
                     used_by_clusters[context][namespace][cluster_name] = []
                 used_by_clusters[context][namespace][cluster_name].append(
                     cluster_name)
-    return used_by_pods, used_by_clusters
+    return used_by_pods, used_by_clusters, failed_volume_names


 def map_all_volumes_usedby(
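
Callers now receive a third value naming volumes whose lookup failed; a hypothetical consumer (the context, namespace, and volume names below are invented) might handle it like this:

used_by_pods, used_by_clusters, failed = {}, {}, {'data-vol-2'}

for vol in ['data-vol-1', 'data-vol-2']:
    if vol in failed:
        print(f'{vol}: in-use status unknown (usedby lookup failed)')
    else:
        pods = used_by_pods.get('my-context', {}).get('default', {}).get(vol, [])
        print(f'{vol}: used by {len(pods)} pod(s)')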
@@ -292,7 +323,9 @@ def create_persistent_volume_claim(
     try:
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         if config is not None:
             _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')
@@ -305,8 +338,10 @@ def create_persistent_volume_claim(
             raise ValueError(
                 f'PVC {pvc_name} does not exist while use_existing is True.')
         pvc = kubernetes.core_api(
-            context).create_namespaced_persistent_volume_claim(
-
+            context).create_namespaced_persistent_volume_claim(
+                namespace=namespace,
+                body=pvc_spec,
+                _request_timeout=kubernetes.API_TIMEOUT)
         logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
         if config is not None:
             _populate_config_from_pvc(config, pvc)
sky/provision/provisioner.py
CHANGED
|
@@ -157,9 +157,9 @@ def bulk_provision(
         logger.debug(f'SkyPilot version: {sky.__version__}; '
                      f'commit: {sky.__commit__}')
         logger.debug(_TITLE.format('Provisioning'))
-
-
-
+        redacted_config = bootstrap_config.get_redacted_config()
+        logger.debug('Provision config:\n'
+                     f'{json.dumps(redacted_config, indent=2)}')
         return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
     except exceptions.NoClusterLaunchedError:
@@ -493,7 +493,8 @@ def _post_provision_setup(
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
     is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
-    if not is_k8s_cloud:
+    is_slurm_cloud = cloud_name.lower() == 'slurm'
+    if not is_k8s_cloud and not is_slurm_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
@@ -635,10 +636,15 @@ def _post_provision_setup(
             status.update(
                 runtime_preparation_str.format(step=3, step_name='runtime'))

+    skip_ray_setup = False
     ray_port = constants.SKY_REMOTE_RAY_PORT
     head_ray_needs_restart = True
     ray_cluster_healthy = False
-    if (not provision_record.is_instance_just_booted(
+    if (launched_resources.cloud is not None and
+            not launched_resources.cloud.uses_ray()):
+        skip_ray_setup = True
+        logger.debug('Skip Ray cluster setup as cloud does not use Ray.')
+    elif (not provision_record.is_instance_just_booted(
             head_instance.instance_id)):
         # Check if head node Ray is alive
         (ray_port, ray_cluster_healthy,
@@ -663,7 +669,9 @@ def _post_provision_setup(
                         'async setup to complete...')
             time.sleep(1)

-    if head_ray_needs_restart:
+    if skip_ray_setup:
+        logger.debug('Skip Ray cluster setup on the head node.')
+    elif head_ray_needs_restart:
         logger.debug('Starting Ray on the entire cluster.')
         instance_setup.start_ray_on_head_node(
             cluster_name.name_on_cloud,
@@ -686,7 +694,9 @@ def _post_provision_setup(
     # We don't need to restart ray on worker nodes if the ray cluster is
     # already healthy, i.e. the head node has expected number of nodes
     # connected to the ray cluster.
-    if cluster_info.num_instances > 1 and not ray_cluster_healthy:
+    if skip_ray_setup:
+        logger.debug('Skip Ray cluster setup on the worker nodes.')
+    elif cluster_info.num_instances > 1 and not ray_cluster_healthy:
         instance_setup.start_ray_on_worker_nodes(
             cluster_name.name_on_cloud,
             no_restart=not head_ray_needs_restart,
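
The Ray bootstrap steps are now gated on a per-cloud uses_ray() capability (seen above as launched_resources.cloud.uses_ray()). A minimal sketch of that gating with stub cloud classes; the class names are invented stand-ins, not SkyPilot's:

class RayBackedCloud:
    def uses_ray(self) -> bool:
        return True

class SlurmLikeCloud:
    def uses_ray(self) -> bool:
        return False

def needs_ray_setup(cloud) -> bool:
    # Mirrors the new condition: skip only when a cloud is known and opts out.
    return cloud is None or cloud.uses_ray()

assert needs_ray_setup(RayBackedCloud())
assert not needs_ray_setup(SlurmLikeCloud())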
sky/provision/runpod/instance.py
CHANGED
|
@@ -1,5 +1,6 @@
 """RunPod instance provisioning."""
 import time
+import traceback
 from typing import Any, Dict, List, Optional, Tuple

 from sky import sky_logging
@@ -116,7 +117,8 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                 volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
-            logger.warning(f'run_instances error: {e}')
+            logger.warning(f'run_instances error: {e}\n'
+                           f'Full traceback:\n{traceback.format_exc()}')
             raise
         logger.info(f'Launched instance {instance_id}.')
         created_instance_ids.append(instance_id)
sky/provision/runpod/utils.py
CHANGED
|
@@ -80,7 +80,11 @@ def _construct_docker_login_template_name(cluster_name: str) -> str:


 def retry(func):
-    """Decorator to retry a function."""
+    """Decorator to retry a function.
+
+    Only retries on transient errors. Does not retry on authorization errors
+    (Unauthorized, Forbidden) as these are not recoverable.
+    """

     def wrapper(*args, **kwargs):
         """Wrapper for retrying a function."""
@@ -89,6 +93,14 @@ def retry(func):
             try:
                 return func(*args, **kwargs)
             except runpod.runpod.error.QueryError as e:
+                error_msg = str(e).lower()
+                # Don't retry on authorization errors - these won't recover
+                auth_keywords = ['unauthorized', 'forbidden', '401', '403']
+                if any(keyword in error_msg for keyword in auth_keywords):
+                    logger.error(f'RunPod authorization error (not retrying): '
+                                 f'{common_utils.format_exception(e)}')
+                    raise
+                cnt += 1
                 if cnt >= 3:
                     raise
                 logger.warning('Retrying for exception: '
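
The same short-circuit can be sketched as a generic helper, using a plain Exception in place of runpod's QueryError; the keyword matching and three-attempt limit mirror the patched decorator:

import time

def retry_transient(func, attempts: int = 3):
    """Retry helper mirroring the keyword check added above (illustration only)."""
    auth_keywords = ('unauthorized', 'forbidden', '401', '403')
    cnt = 0
    while True:
        try:
            return func()
        except Exception as e:  # broad catch for the sketch only
            msg = str(e).lower()
            if any(k in msg for k in auth_keywords):
                raise  # credentials problem: retrying cannot help
            cnt += 1
            if cnt >= attempts:
                raise
            time.sleep(1)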
sky/provision/runpod/volume.py
CHANGED
|
@@ -1,5 +1,5 @@
 """RunPod network volume provisioning."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple

 from sky import global_user_state
 from sky import models
@@ -194,15 +194,31 @@ def get_volume_usedby(

 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes.
-
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of volume name to pods using the volume.
+        usedby_clusters: Dictionary of volume name to clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     used_by_pods, used_by_clusters = {}, {}
-
-
-
-
-
+    failed_volume_names = set()
+    for config in configs:
+        try:
+            usedby_pods, usedby_clusters = get_volume_usedby(config)
+            used_by_pods[config.name_on_cloud] = usedby_pods
+            used_by_clusters[config.name_on_cloud] = usedby_clusters
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get usedby info for RunPod volume '
+                         f'{config.name}: {e}')
+            failed_volume_names.add(config.name)
+            continue
+    return used_by_pods, used_by_clusters, failed_volume_names


 def map_all_volumes_usedby(
sky/provision/slurm/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
+"""Slurm provisioner for SkyPilot."""
+
+from sky.provision.slurm.config import bootstrap_instances
+from sky.provision.slurm.instance import cleanup_ports
+from sky.provision.slurm.instance import get_cluster_info
+from sky.provision.slurm.instance import get_command_runners
+from sky.provision.slurm.instance import open_ports
+from sky.provision.slurm.instance import query_instances
+from sky.provision.slurm.instance import run_instances
+from sky.provision.slurm.instance import stop_instances
+from sky.provision.slurm.instance import terminate_instances
+from sky.provision.slurm.instance import wait_instances
sky/provision/slurm/config.py
ADDED
|
@@ -0,0 +1,13 @@
+"""Slrum-specific configuration for the provisioner."""
+import logging
+
+from sky.provision import common
+
+logger = logging.getLogger(__name__)
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    del region, cluster_name  # unused
+    return config
|