skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend.py +10 -0
  3. sky/backends/backend_utils.py +207 -79
  4. sky/backends/cloud_vm_ray_backend.py +37 -13
  5. sky/backends/local_docker_backend.py +9 -0
  6. sky/client/cli/command.py +112 -53
  7. sky/client/common.py +4 -2
  8. sky/client/sdk.py +17 -7
  9. sky/client/sdk_async.py +4 -2
  10. sky/clouds/kubernetes.py +2 -1
  11. sky/clouds/runpod.py +20 -7
  12. sky/core.py +9 -54
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
  22. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  23. sky/dashboard/out/clusters/[cluster].html +1 -1
  24. sky/dashboard/out/clusters.html +1 -1
  25. sky/dashboard/out/config.html +1 -1
  26. sky/dashboard/out/index.html +1 -1
  27. sky/dashboard/out/infra/[context].html +1 -1
  28. sky/dashboard/out/infra.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  31. sky/dashboard/out/jobs.html +1 -1
  32. sky/dashboard/out/users.html +1 -1
  33. sky/dashboard/out/volumes.html +1 -1
  34. sky/dashboard/out/workspace/new.html +1 -1
  35. sky/dashboard/out/workspaces/[name].html +1 -1
  36. sky/dashboard/out/workspaces.html +1 -1
  37. sky/data/mounting_utils.py +19 -10
  38. sky/execution.py +4 -2
  39. sky/global_user_state.py +271 -67
  40. sky/jobs/client/sdk.py +10 -1
  41. sky/jobs/constants.py +2 -0
  42. sky/jobs/controller.py +11 -7
  43. sky/jobs/server/core.py +5 -3
  44. sky/jobs/server/server.py +15 -11
  45. sky/jobs/utils.py +1 -1
  46. sky/logs/agent.py +30 -3
  47. sky/logs/aws.py +9 -19
  48. sky/provision/__init__.py +2 -1
  49. sky/provision/aws/instance.py +2 -1
  50. sky/provision/azure/instance.py +2 -1
  51. sky/provision/cudo/instance.py +2 -2
  52. sky/provision/do/instance.py +2 -2
  53. sky/provision/docker_utils.py +41 -19
  54. sky/provision/fluidstack/instance.py +2 -2
  55. sky/provision/gcp/instance.py +2 -1
  56. sky/provision/hyperbolic/instance.py +2 -1
  57. sky/provision/instance_setup.py +1 -1
  58. sky/provision/kubernetes/instance.py +134 -8
  59. sky/provision/lambda_cloud/instance.py +2 -1
  60. sky/provision/nebius/instance.py +2 -1
  61. sky/provision/oci/instance.py +2 -1
  62. sky/provision/paperspace/instance.py +2 -2
  63. sky/provision/primeintellect/instance.py +2 -2
  64. sky/provision/provisioner.py +1 -0
  65. sky/provision/runpod/__init__.py +2 -0
  66. sky/provision/runpod/instance.py +2 -2
  67. sky/provision/scp/instance.py +2 -2
  68. sky/provision/seeweb/instance.py +2 -1
  69. sky/provision/vast/instance.py +2 -1
  70. sky/provision/vsphere/instance.py +6 -5
  71. sky/schemas/api/responses.py +2 -1
  72. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  73. sky/serve/autoscalers.py +2 -0
  74. sky/serve/client/impl.py +45 -19
  75. sky/serve/replica_managers.py +12 -5
  76. sky/serve/serve_utils.py +5 -7
  77. sky/serve/server/core.py +9 -6
  78. sky/serve/server/impl.py +78 -25
  79. sky/serve/server/server.py +4 -5
  80. sky/serve/service_spec.py +33 -0
  81. sky/server/constants.py +1 -1
  82. sky/server/daemons.py +2 -3
  83. sky/server/requests/executor.py +56 -6
  84. sky/server/requests/payloads.py +32 -8
  85. sky/server/requests/preconditions.py +2 -3
  86. sky/server/rest.py +2 -0
  87. sky/server/server.py +28 -19
  88. sky/server/stream_utils.py +34 -12
  89. sky/setup_files/dependencies.py +5 -2
  90. sky/setup_files/setup.py +44 -44
  91. sky/skylet/constants.py +4 -1
  92. sky/skylet/events.py +42 -0
  93. sky/templates/jobs-controller.yaml.j2 +3 -0
  94. sky/templates/kubernetes-ray.yml.j2 +24 -18
  95. sky/usage/usage_lib.py +3 -0
  96. sky/utils/cli_utils/status_utils.py +4 -5
  97. sky/utils/context.py +104 -29
  98. sky/utils/controller_utils.py +7 -6
  99. sky/utils/db/db_utils.py +5 -1
  100. sky/utils/db/migration_utils.py +1 -1
  101. sky/utils/kubernetes/create_cluster.sh +13 -28
  102. sky/utils/kubernetes/delete_cluster.sh +10 -7
  103. sky/utils/kubernetes/generate_kind_config.py +6 -66
  104. sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
  105. sky/utils/kubernetes_enums.py +5 -0
  106. sky/utils/ux_utils.py +35 -1
  107. sky/utils/yaml_utils.py +9 -0
  108. sky/volumes/client/sdk.py +44 -8
  109. sky/volumes/server/core.py +1 -0
  110. sky/volumes/server/server.py +33 -7
  111. sky/volumes/volume.py +35 -28
  112. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
  113. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
  114. sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
  119. /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
  120. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py CHANGED
@@ -24,6 +24,7 @@ from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import kubernetes_enums
+from sky.utils import rich_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
@@ -302,8 +303,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
                        f'code {rc}: {command!r}\nOutput: {stdout}.')


+def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+                                          reason) -> bool:
+
+    def _convert_to_utc(timestamp):
+        if timestamp.tzinfo is None:
+            return timestamp.replace(tzinfo=datetime.timezone.utc)
+        return timestamp.astimezone(datetime.timezone.utc)
+
+    def _get_event_timestamp(event):
+        if event.last_timestamp:
+            return event.last_timestamp
+        elif event.metadata.creation_timestamp:
+            return event.metadata.creation_timestamp
+        return None
+
+    events = kubernetes.core_api(context).list_namespaced_event(
+        namespace=namespace, field_selector=f'reason={reason}')
+    for event in events.items:
+        ts = _get_event_timestamp(event)
+        if ts and _convert_to_utc(ts) > search_start:
+            return True
+    return False
+
+
+def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+    """Detects whether the cluster had a autoscaling event after a
+    specified datetime. This only works when using cluster-autoscaler.
+
+    Args:
+        namespace: kubernetes namespace
+        context: kubernetes context
+        search_start (datetime.datetime): filter for events that occurred
+            after search_start
+
+    Returns:
+        A boolean whether the cluster has an autoscaling event or not.
+    """
+    assert namespace is not None
+
+    try:
+        return _detect_cluster_event_reason_occurred(namespace, context,
+                                                     search_start,
+                                                     'TriggeredScaleUp')
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+        return False
+
+
+def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+    """Detects whether a kubernetes cluster may have an autoscaling event.
+
+    This is not a definitive detection. FailedScheduling, which is an
+    event that can occur when not enough resources are present in the cluster,
+    which is a trigger for cluster autoscaling. However, FailedScheduling may
+    have occurred due to other reasons (cluster itself is abnormal).
+
+    Hence, this should only be used for autoscalers that don't emit the
+    TriggeredScaleUp event, e.g.: Karpenter.
+
+    Args:
+        namespace: kubernetes namespace
+        context: kubernetes context
+        search_start (datetime.datetime): filter for events that occurred
+            after search_start
+
+    Returns:
+        A boolean whether the cluster has an autoscaling event or not.
+    """
+    assert namespace is not None
+
+    try:
+        return _detect_cluster_event_reason_occurred(namespace, context,
+                                                     search_start,
+                                                     'FailedScheduling')
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+        return False
+
+
 @timeline.event
-def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+                               cluster_name: str,
+                               create_pods_start: datetime.datetime):
     """Wait for all pods to be scheduled.

     Wait for all pods including jump pod to be scheduled, and if it
@@ -312,6 +394,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     allocated and we can exit.

     If timeout is set to a negative value, this method will wait indefinitely.
+
+    Will update the spinner message to indicate autoscaling if autoscaling
+    is happening.
     """
     # Create a set of pod names we're waiting for
     if not new_nodes:
@@ -319,6 +404,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     expected_pod_names = {node.metadata.name for node in new_nodes}
     start_time = time.time()

+    # Variables for autoscaler detection
+    autoscaler_type = skypilot_config.get_effective_region_config(
+        cloud='kubernetes',
+        region=context,
+        keys=('autoscaler',),
+        default_value=None)
+    autoscaler_is_set = autoscaler_type is not None
+    use_heuristic_detection = (autoscaler_is_set and
+                               not kubernetes_enums.KubernetesAutoscalerType(
+                                   autoscaler_type).emits_autoscale_event())
+    is_autoscaling = False
+
     def _evaluate_timeout() -> bool:
         # If timeout is negative, retry indefinitely.
         if timeout < 0:
@@ -328,12 +425,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
     while _evaluate_timeout():
         # Get all pods in a single API call using the cluster name label
         # which all pods in new_nodes should share
-        cluster_name = new_nodes[0].metadata.labels[
+        cluster_name_on_cloud = new_nodes[0].metadata.labels[
             k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in pods}
@@ -357,6 +455,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):

         if all_scheduled:
             return
+
+        # Check if cluster is autoscaling and update spinner message.
+        # Minor optimization to not query k8s api after autoscaling
+        # event was detected. This is useful because there isn't any
+        # autoscaling complete event.
+        if autoscaler_is_set and not is_autoscaling:
+            if use_heuristic_detection:
+                is_autoscaling = _cluster_maybe_autoscaling(
+                    namespace, context, create_pods_start)
+                msg = 'Kubernetes cluster may be scaling up'
+            else:
+                is_autoscaling = _cluster_had_autoscale_event(
+                    namespace, context, create_pods_start)
+                msg = 'Kubernetes cluster is autoscaling'
+
+            if is_autoscaling:
+                rich_utils.force_update_status(
+                    ux_utils.spinner_message(f'Launching ({msg})',
+                                             cluster_name=cluster_name))
+
         time.sleep(1)

     # Handle pod scheduling errors
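
A note on what the event check above amounts to: the helpers list namespaced events filtered by reason and compare their timestamps against the time provisioning started. A minimal standalone sketch of the same idea using the official kubernetes Python client (SkyPilot itself goes through its kubernetes.core_api() adaptor; the function name below is illustrative):

# Illustrative sketch only; mirrors the detection helpers above using the
# official `kubernetes` client directly instead of SkyPilot's adaptor.
import datetime

from kubernetes import client, config


def had_event_since(namespace: str, reason: str,
                    since: datetime.datetime) -> bool:
    """True if an event with `reason` occurred in `namespace` after `since`."""
    config.load_kube_config()  # or config.load_incluster_config() in-cluster
    events = client.CoreV1Api().list_namespaced_event(
        namespace=namespace, field_selector=f'reason={reason}')
    for event in events.items:
        # Prefer last_timestamp, falling back to the creation timestamp.
        ts = event.last_timestamp or event.metadata.creation_timestamp
        if ts is None:
            continue
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=datetime.timezone.utc)
        if ts.astimezone(datetime.timezone.utc) > since:
            return True
    return False


# Example: did the cluster autoscaler trigger a scale-up in the last 5 minutes?
# had_event_since('default', 'TriggeredScaleUp',
#                 datetime.datetime.now(datetime.timezone.utc)
#                 - datetime.timedelta(minutes=5))
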
@@ -761,13 +879,14 @@ def _wait_for_deployment_pod(context,


 @timeline.event
-def _create_pods(region: str, cluster_name_on_cloud: str,
+def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Create pods based on the config."""
     provider_config = config.provider_config
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
     pod_spec = copy.deepcopy(config.node_config)
+    create_pods_start = datetime.datetime.now(datetime.timezone.utc)

     to_create_deployment = 'deployment_spec' in pod_spec
     if to_create_deployment:
@@ -1047,7 +1166,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+                               cluster_name, create_pods_start)
+    # Reset spinner message here because it might have hinted autoscaling
+    # while waiting for pods to schedule.
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching', cluster_name=cluster_name))
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
@@ -1068,11 +1192,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     )


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
     try:
-        return _create_pods(region, cluster_name_on_cloud, config)
+        return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
         e_msg = common_utils.format_exception(e).replace('\n', ' ')
         logger.warning('run_instances: Error occurred when creating pods: '
@@ -1238,6 +1362,7 @@ def get_cluster_info(

     running_pods = kubernetes_utils.filter_pods(
         namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+    logger.debug(f'Running pods: {list(running_pods.keys())}')

     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -1276,7 +1401,8 @@ def get_cluster_info(
         assert head_spec is not None, pod
         cpu_request = head_spec.containers[0].resources.requests['cpu']

-    assert cpu_request is not None, 'cpu_request should not be None'
+    assert cpu_request is not None, ('cpu_request should not be None, check '
+                                     'the Pod status')

     ssh_user = 'sky'
     # Use pattern matching to extract SSH user, handling MOTD contamination.
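
The use_heuristic_detection switch in _wait_for_pods_to_schedule assumes the autoscaler-type enum can report whether it emits TriggeredScaleUp events (the sky/utils/kubernetes_enums.py change in this release). A rough, hypothetical sketch of such a capability check; the member names here are illustrative and not necessarily the real enum values:

# Hypothetical sketch of an autoscaler-type enum with an
# emits_autoscale_event() capability check; not copied from SkyPilot.
import enum


class AutoscalerType(enum.Enum):
    GKE = 'gke'
    KARPENTER = 'karpenter'
    GENERIC = 'generic'

    def emits_autoscale_event(self) -> bool:
        # cluster-autoscaler-based setups emit TriggeredScaleUp events;
        # Karpenter does not, so callers fall back to the FailedScheduling
        # heuristic for it.
        return self is not AutoscalerType.KARPENTER
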
sky/provision/lambda_cloud/instance.py CHANGED
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
     return private_ip


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster"""
+    del cluster_name  # unused
     lambda_client = _get_lambda_client()
     pending_status = ['booting']
     while True:
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
                     f' to be ready.')


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     _wait_until_no_pending(region, cluster_name_on_cloud)
     running_instances = _filter_instances(region, cluster_name_on_cloud,
                                           ['RUNNING'])
@@ -65,9 +65,10 @@ def query_instances(


 @query_utils.debug_enabled(logger)
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Start instances with bootstrapped configuration."""
+    del cluster_name  # unused
     tags = dict(sorted(copy.deepcopy(config.tags).items()))

     start_time = round(time.time() * 1000)
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
     ]
@@ -65,10 +65,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     # Helper is available as utils.parse_ssh_connection.


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = [
         'PROVISIONING',
         'PENDING',
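
The hunks above (and several more below) apply the same mechanical change to every provider backend: run_instances gains the user-facing cluster name next to the on-cloud name, and providers that do not consume it discard it explicitly. A minimal sketch of the assumed shared shape, not copied from any single provider:

# Assumed shape of the updated per-cloud provision entrypoint; only
# Kubernetes actually uses `cluster_name` (for spinner messages).
from sky.provision import common


def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster."""
    del cluster_name  # unused by most providers
    # ... provider-specific provisioning keyed on cluster_name_on_cloud ...
    raise NotImplementedError
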
sky/provision/provisioner.py CHANGED
@@ -69,6 +69,7 @@ def _bulk_provision(

         provision_record = provision.run_instances(provider_name,
                                                    region_name,
+                                                   str(cluster_name),
                                                    cluster_name.name_on_cloud,
                                                    config=config)

sky/provision/runpod/__init__.py CHANGED
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
 from sky.provision.runpod.volume import apply_volume
 from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_all_volumes_usedby
 from sky.provision.runpod.volume import get_volume_usedby
+from sky.provision.runpod.volume import map_all_volumes_usedby
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return head_instance_id


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
-
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']

     while True:
@@ -13,9 +13,9 @@ from sky.utils import status_lib
 logger = logging.getLogger(__name__)


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
-
+    del cluster_name  # unused
     zone_id = config.node_config['zone_id']
     running_instances = _filter_instances(cluster_name_on_cloud, ['RUNNING'])
     head_instance_id = _get_head_instance_id(running_instances)
sky/provision/seeweb/instance.py CHANGED
@@ -502,9 +502,10 @@ class SeewebNodeProvider:
 # =============================================================================


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: ProvisionConfig) -> ProvisionRecord:
     """Run instances for Seeweb cluster."""
+    del cluster_name  # unused
     provider = SeewebNodeProvider(config, cluster_name_on_cloud)
     provider.run_instances(config.node_config, config.count)

@@ -44,9 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     return None


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']

     created_instance_ids = []
sky/provision/vsphere/instance.py CHANGED
@@ -30,9 +30,10 @@ HEAD_NODE_VALUE = '1'
 WORKER_NODE_VALUE = '0'


-def run_instances(region: str, cluster_name: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     logger.info('New provision of Vsphere: run_instances().')

     resumed_instance_ids: List[str] = []
@@ -40,7 +41,7 @@ def run_instances(region: str, cluster_name: str,
     vc_object = _get_vc_object(region)
     vc_object.connect()

-    exist_instances = _get_filtered_instance(vc_object, cluster_name,
+    exist_instances = _get_filtered_instance(vc_object, cluster_name_on_cloud,
                                              config.provider_config)
     head_instance_id = _get_head_instance_id(exist_instances)

@@ -89,8 +90,8 @@ def run_instances(region: str, cluster_name: str,
                                                   config, region, vc_object)
     # TODO: update logic for multi-node creation
     for _ in range(to_start_num):
-        created_instance_uuid = _create_instances(cluster_name, config,
-                                                  region, vc_object,
+        created_instance_uuid = _create_instances(cluster_name_on_cloud,
+                                                  config, region, vc_object,
                                                   vsphere_cluster_name)
         created_instance_ids.append(created_instance_uuid)
         if head_instance_id is None:
@@ -104,7 +105,7 @@ def run_instances(region: str, cluster_name: str,
         provider_name='vsphere',
         region=region,
         zone=vsphere_cluster_name,
-        cluster_name=cluster_name,
+        cluster_name=cluster_name_on_cloud,
         head_instance_id=head_instance_id,
         resumed_instance_ids=resumed_instance_ids,
         created_instance_ids=created_instance_ids,
sky/schemas/api/responses.py CHANGED
@@ -86,7 +86,7 @@ class StatusResponse(ResponseBaseModel):
     # backends.ResourceHandle, so we use Any here.
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
-    handle: Any
+    handle: Optional[Any] = None
     last_use: str
     status: status_lib.ClusterStatus
     autostop: int
@@ -118,6 +118,7 @@ class StatusResponse(ResponseBaseModel):
     cpus: Optional[str] = None
     memory: Optional[str] = None
     accelerators: Optional[str] = None
+    cluster_name_on_cloud: Optional[str] = None


 class UploadStatus(enum.Enum):
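
Making handle optional (and adding cluster_name_on_cloud with a None default) keeps StatusResponse parseable when a response omits these fields. A toy illustration of the effect with a hypothetical stand-in model, not the real schema:

# Toy example of Optional fields with None defaults in pydantic v2.
from typing import Any, Optional

import pydantic


class StatusRow(pydantic.BaseModel):
    name: str
    handle: Optional[Any] = None
    cluster_name_on_cloud: Optional[str] = None


# A payload that omits both optional fields still validates:
row = StatusRow.model_validate({'name': 'my-cluster'})
assert row.handle is None and row.cluster_name_on_cloud is None
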
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py ADDED
@@ -0,0 +1,89 @@
+"""Add last_activity_time and launched_at to cluster history.
+
+Revision ID: 009
+Revises: 008
+Create Date: 2025-09-24
+
+"""
+# pylint: disable=invalid-name
+import pickle
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '009'
+down_revision: Union[str, Sequence[str], None] = '008'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add last_activity_time and launched_at columns to cluster history."""
+    with op.get_context().autocommit_block():
+        # Add the columns with indices
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'last_activity_time',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'launched_at',
+                                             sa.Integer(),
+                                             server_default=None,
+                                             index=True)
+
+    # Populate the columns for existing rows
+    _populate_cluster_history_columns()
+
+
+def _populate_cluster_history_columns():
+    """Populate last_activity_time and launched_at for existing rows using
+    usage_intervals logic."""
+    connection = op.get_bind()
+
+    # Get all existing rows with usage_intervals
+    result = connection.execute(
+        sa.text('SELECT cluster_hash, usage_intervals FROM cluster_history '
+                'WHERE usage_intervals IS NOT NULL'))
+
+    for row in result:
+        cluster_hash = row[0]
+        usage_intervals_blob = row[1]
+
+        try:
+            # Deserialize the usage_intervals
+            usage_intervals = pickle.loads(usage_intervals_blob)
+
+            if usage_intervals:
+                # Calculate last_activity_time: end time of last interval
+                # or start time if still running
+                last_interval = usage_intervals[-1]
+                last_activity_time = (last_interval[1] if last_interval[1]
+                                      is not None else last_interval[0])
+
+                # Calculate launched_at: start time of first interval
+                launched_at = usage_intervals[0][0]
+
+                # Update the row with both calculated values
+                connection.execute(
+                    sa.text('UPDATE cluster_history '
+                            'SET last_activity_time = :last_activity_time, '
+                            'launched_at = :launched_at '
+                            'WHERE cluster_hash = :cluster_hash'), {
+                                'last_activity_time': last_activity_time,
+                                'launched_at': launched_at,
+                                'cluster_hash': cluster_hash
+                            })
+        except (pickle.PickleError, AttributeError, IndexError):
+            # Skip rows with corrupted or invalid usage_intervals
+            continue
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
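
To make the backfill concrete: usage_intervals is a pickled list of (start, end) pairs, with end set to None while the cluster is still up. A worked example (timestamps shown as illustrative epoch seconds) of how the two derived columns come out:

# Worked example of the backfill computation in the migration above.
usage_intervals = [(1700000000, 1700003600), (1700010000, None)]

launched_at = usage_intervals[0][0]  # start of the first interval
last = usage_intervals[-1]
# End of the last interval, or its start if the cluster is still running.
last_activity_time = last[1] if last[1] is not None else last[0]

print(launched_at, last_activity_time)  # 1700000000 1700010000
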
sky/serve/autoscalers.py CHANGED
@@ -411,6 +411,8 @@ class _AutoscalerWithHysteresis(Autoscaler):
         # `_set_target_num_replicas_with_hysteresis` to have the replicas
         # quickly scale after each update.
         self.target_num_replicas = self._calculate_target_num_replicas()
+        logger.debug(f'Target number of replicas: {self.target_num_replicas}'
+                     'after update_version.')
         # Cleanup hysteresis counters.
         self.upscale_counter = 0
         self.downscale_counter = 0
sky/serve/client/impl.py CHANGED
@@ -105,7 +105,8 @@ def update(


 def apply(
-    task: Union['sky.Task', 'sky.Dag'],
+    task: Optional[Union['sky.Task', 'sky.Dag']],
+    workers: Optional[int],
     service_name: str,
     mode: 'serve_utils.UpdateMode',
     pool: bool = False,
@@ -117,35 +118,60 @@ def apply(
     # Avoid circular import.
     from sky.client import sdk  # pylint: disable=import-outside-toplevel

-    dag = dag_utils.convert_entrypoint_to_dag(task)
-    with admin_policy_utils.apply_and_use_config_in_current_request(
-            dag, at_client_side=True) as dag:
-        sdk.validate(dag)
-        request_id = sdk.optimize(dag)
-        sdk.stream_and_get(request_id)
-        if _need_confirmation:
-            noun = 'pool' if pool else 'service'
-            prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
-            if prompt is not None:
-                click.confirm(prompt,
-                              default=True,
-                              abort=True,
-                              show_default=True)
-
-        dag = client_common.upload_mounts_to_api_server(dag)
-        dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+    noun = 'pool' if pool else 'service'
+    # There are two cases here. If task is None, we should be trying to
+    # update the number of workers in the pool. If task is not None, we should
+    # be trying to apply a new config to the pool. The two code paths
+    # are slightly different with us needing to craft the dag and validate
+    # it if we have a task. In the future we could move this logic to the
+    # server side and simplify this code, for the time being we keep it here.
+    if task is None:
+        if workers is None:
+            raise ValueError(f'Cannot create a new {noun} without specifying '
+                             f'task or workers. Please provide either a task '
+                             f'or specify the number of workers.')

         body = payloads.JobsPoolApplyBody(
-            task=dag_str,
+            workers=workers,
             pool_name=service_name,
             mode=mode,
         )
+
         response = server_common.make_authenticated_request(
             'POST',
             '/jobs/pool_apply',
             json=json.loads(body.model_dump_json()),
             timeout=(5, None))
         return server_common.get_request_id(response)
+    else:
+        dag = dag_utils.convert_entrypoint_to_dag(task)
+        with admin_policy_utils.apply_and_use_config_in_current_request(
+                dag, at_client_side=True) as dag:
+            sdk.validate(dag)
+            request_id = sdk.optimize(dag)
+            sdk.stream_and_get(request_id)
+            if _need_confirmation:
+                prompt = f'Applying config to {noun} {service_name!r}. Proceed?'
+                if prompt is not None:
+                    click.confirm(prompt,
+                                  default=True,
+                                  abort=True,
+                                  show_default=True)
+
+            dag = client_common.upload_mounts_to_api_server(dag)
+            dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
+
+            body = payloads.JobsPoolApplyBody(
+                task=dag_str,
+                pool_name=service_name,
+                mode=mode,
+            )
+            response = server_common.make_authenticated_request(
+                'POST',
+                '/jobs/pool_apply',
+                json=json.loads(body.model_dump_json()),
+                timeout=(5, None))
+            return server_common.get_request_id(response)


 def down(
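
Illustrative call sites for the two branches of apply() after this change (a sketch only; it assumes a reachable API server, the sky.serve.client.impl module path shown in the header above, and that serve_utils.UpdateMode.ROLLING is a valid mode):

# Sketch of the two ways to call apply(); not a test.
import sky
from sky.serve import serve_utils
from sky.serve.client import impl

# 1) Scale an existing pool to 4 workers without re-uploading a task.
request_id = impl.apply(task=None, workers=4, service_name='my-pool',
                        mode=serve_utils.UpdateMode.ROLLING, pool=True)

# 2) Apply a full task/config update to the same pool.
task = sky.Task(run='echo hi')
request_id = impl.apply(task=task, workers=None, service_name='my-pool',
                        mode=serve_utils.UpdateMode.ROLLING, pool=True)
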
sky/serve/replica_managers.py CHANGED
@@ -422,11 +422,12 @@ class ReplicaInfo:
         based on the cluster name.
         """
         if cluster_record is None:
-            cluster_record = global_user_state.get_cluster_from_name(
+            handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
-            if cluster_record is None:
+        else:
+            handle = cluster_record['handle']
+        if handle is None:
             return None
-        handle = cluster_record['handle']
         assert isinstance(handle, backends.CloudVmRayResourceHandle)
         return handle

@@ -443,6 +444,12 @@ class ReplicaInfo:
         handle = self.handle()
         if handle is None:
             return None
+        if self.replica_port == '-':
+            # This is a pool replica so there is no endpoint and it's filled
+            # with this dummy value. We return None here so that we can
+            # get the active ready replicas and perform autoscaling. Otherwise,
+            # would error out when trying to get the endpoint.
+            return None
         replica_port_int = int(self.replica_port)
         try:
             endpoint_dict = backend_utils.get_endpoints(handle.cluster_name,
@@ -470,7 +477,7 @@ class ReplicaInfo:
                 with_handle: bool,
                 with_url: bool = True) -> Dict[str, Any]:
         cluster_record = global_user_state.get_cluster_from_name(
-            self.cluster_name)
+            self.cluster_name, include_user_info=False, summary_response=True)
         info_dict = {
             'replica_id': self.replica_id,
             'name': self.cluster_name,
@@ -956,7 +963,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         # provision) or the cluster is preempted and cleaned up by the status
         # refresh. In this case, we skip spawning a new down process to save
         # controller resources.
-        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+        if not global_user_state.cluster_with_name_exists(info.cluster_name):
             self._handle_sky_down_finish(info, exitcode=0)
             return
