PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20251005__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20251005__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (57) hide show

sky/__init__.py +2 -2
sky/authentication.py +17 -21
sky/backends/backend.py +1 -3
sky/backends/cloud_vm_ray_backend.py +8 -20
sky/backends/local_docker_backend.py +0 -5
sky/client/sdk.py +24 -23
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -1
sky/dashboard/out/infra.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs/pools/[pool].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/users.html +1 -1
sky/dashboard/out/volumes.html +1 -1
sky/dashboard/out/workspace/new.html +1 -1
sky/dashboard/out/workspaces/[name].html +1 -1
sky/dashboard/out/workspaces.html +1 -1
sky/execution.py +1 -11
sky/global_user_state.py +16 -5
sky/jobs/constants.py +1 -7
sky/jobs/controller.py +9 -1
sky/jobs/scheduler.py +30 -15
sky/jobs/server/core.py +8 -3
sky/jobs/utils.py +30 -2
sky/metrics/utils.py +62 -45
sky/provision/instance_setup.py +32 -10
sky/provision/kubernetes/utils.py +4 -1
sky/provision/provisioner.py +10 -7
sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
sky/server/common.py +1 -0
sky/server/config.py +2 -0
sky/server/metrics.py +3 -1
sky/server/requests/executor.py +103 -77
sky/server/requests/requests.py +26 -11
sky/server/server.py +16 -0
sky/skylet/constants.py +9 -1
sky/skylet/events.py +17 -0
sky/skylet/skylet.py +3 -0
sky/templates/kubernetes-ray.yml.j2 +5 -0
sky/utils/context_utils.py +5 -1
sky/utils/controller_utils.py +14 -0
sky/utils/db/db_utils.py +2 -0
sky/utils/db/migration_utils.py +11 -2
sky/volumes/server/server.py +2 -2
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/METADATA +35 -35
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/RECORD +57 -56
/sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → MnvNdzHHpiZG1_oKSpbxF}/_buildManifest.js +0 -0
/sky/dashboard/out/_next/static/{Vg53Kzbf7u4o6fYPeOHMe → MnvNdzHHpiZG1_oKSpbxF}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20251005.dist-info → skypilot_nightly-1.0.0.dev20251008.dist-info}/top_level.txt +0 -0

sky/execution.py CHANGED Viewed

@@ -112,7 +112,6 @@ def _execute(
     stages: Optional[List[Stage]] = None,
     cluster_name: Optional[str] = None,
     detach_setup: bool = False,
-    detach_run: bool = False,
     idle_minutes_to_autostop: Optional[int] = None,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
@@ -157,8 +156,6 @@ def _execute(
         job itself. You can safely ctrl-c to detach from logging, and it will
         not interrupt the setup process. To see the logs again after detaching,
         use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
-      detach_run: If True, as soon as a job is submitted, return from this
-        function and do not stream execution logs.
       idle_minutes_to_autostop: int; if provided, the cluster will be set to
         autostop after this many minutes of idleness.
       no_setup: bool; whether to skip setup commands or not when (re-)launching.
@@ -217,7 +214,6 @@ def _execute(
             stages=stages,
             cluster_name=cluster_name,
             detach_setup=detach_setup,
-            detach_run=detach_run,
             no_setup=no_setup,
             clone_disk_from=clone_disk_from,
             skip_unnecessary_provisioning=skip_unnecessary_provisioning,
@@ -239,7 +235,6 @@ def _execute_dag(
     stages: Optional[List[Stage]],
     cluster_name: Optional[str],
     detach_setup: bool,
-    detach_run: bool,
     no_setup: bool,
     clone_disk_from: Optional[str],
     skip_unnecessary_provisioning: bool,
@@ -507,10 +502,7 @@ def _execute_dag(
         if Stage.EXEC in stages:
             try:
                 global_user_state.update_last_use(handle.get_cluster_name())
-                job_id = backend.execute(handle,
-                                         task,
-                                         detach_run,
-                                         dryrun=dryrun)
+                job_id = backend.execute(handle, task, dryrun=dryrun)
             finally:
                 # Enables post_execute() to be run after KeyboardInterrupt.
                 backend.post_execute(handle, down)
@@ -707,7 +699,6 @@ def launch(
         stages=stages,
         cluster_name=cluster_name,
         detach_setup=detach_setup,
-        detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
@@ -802,6 +793,5 @@ def exec(  # pylint: disable=redefined-builtin
             Stage.EXEC,
         ],
         cluster_name=cluster_name,
-        detach_run=True,
         job_logger=job_logger,
     )

sky/global_user_state.py CHANGED Viewed

@@ -2495,11 +2495,22 @@ def _set_cluster_yaml_from_file(cluster_yaml_path: str,
     # on the local file system and migrate it to the database.
     # TODO(syang): remove this check once we have a way to migrate the
     # cluster from file to database. Remove on v0.12.0.
-    if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
-        with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
-            yaml_str = f.read()
-        set_cluster_yaml(cluster_name, yaml_str)
-        return yaml_str
+    if cluster_yaml_path is not None:
+        # First try the exact path
+        path_to_read = None
+        if os.path.exists(cluster_yaml_path):
+            path_to_read = cluster_yaml_path
+        # Fallback: try with .debug suffix (when debug logging was enabled)
+        # Debug logging causes YAML files to be saved with .debug suffix
+        # but the path stored in the handle doesn't include it
+        debug_path = cluster_yaml_path + '.debug'
+        if os.path.exists(debug_path):
+            path_to_read = debug_path
+        if path_to_read is not None:
+            with open(path_to_read, 'r', encoding='utf-8') as f:
+                yaml_str = f.read()
+            set_cluster_yaml(cluster_name, yaml_str)
+            return yaml_str
     return None

sky/jobs/constants.py CHANGED Viewed

@@ -15,16 +15,10 @@ JOB_CONTROLLER_INDICATOR_FILE = '~/.sky/is_jobs_controller'
 CONSOLIDATED_SIGNAL_PATH = os.path.expanduser('~/.sky/signals/')
 SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Resources as a dict for the jobs controller.
-# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
-# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
-# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
-# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
-# parallelism limit, and memory / 350MB is the limit to concurrently running
-# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
 CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
     'cpus': '4+',
-    'memory': '8x',
+    'memory': '4x',
     'disk_size': 50
 }

sky/jobs/controller.py CHANGED Viewed

@@ -1144,7 +1144,15 @@ class Controller:
                 await asyncio.sleep(30)
                 continue
-            if len(running_tasks) >= scheduler.JOBS_PER_WORKER:
+            # Normally, 200 jobs can run on each controller. But if we have a
+            # ton of controllers, we need to limit the number of jobs that can
+            # run on each controller, to achieve a total of 2000 jobs across all
+            # controllers.
+            max_jobs = min(scheduler.MAX_JOBS_PER_WORKER,
+                           (scheduler.MAX_TOTAL_RUNNING_JOBS //
+                            scheduler.get_number_of_controllers()))
+            if len(running_tasks) >= max_jobs:
                 await asyncio.sleep(60)
                 continue

sky/jobs/scheduler.py CHANGED Viewed

@@ -63,7 +63,9 @@ from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.server import config as server_config
 from sky.skylet import constants
+from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 if typing.TYPE_CHECKING:
@@ -91,20 +93,29 @@ JOB_MEMORY_MB = 400
 LAUNCHES_PER_WORKER = 8
 # this can probably be increased to around 300-400 but keeping it lower to just
 # to be safe
-JOBS_PER_WORKER = 200
-# keep 1GB reserved after the controllers
-MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
-CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+MAX_JOBS_PER_WORKER = 200
+# Maximum number of controllers that can be running. Hard to handle more than
+# 512 launches at once.
+MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
+# Limit the number of jobs that can be running at once on the entire jobs
+# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
+# once.
+# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
+# hardcoded max limit.
+MAX_TOTAL_RUNNING_JOBS = 2000
 # Maximum values for above constants. There will start to be lagging issues
 # at these numbers already.
 # JOB_MEMORY_MB = 200
 # LAUNCHES_PER_WORKER = 16
 # JOBS_PER_WORKER = 400
+# keep 2GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+@annotations.lru_cache(scope='global')
 def get_number_of_controllers() -> int:
     """Returns the number of controllers that should be running.
@@ -123,7 +134,7 @@ def get_number_of_controllers() -> int:
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
-    total_memory_mb = common_utils.get_mem_size_gb() * 1024
+    total_memory_mb = controller_utils.get_controller_mem_size_gb() * 1024
     if consolidation_mode:
         config = server_config.compute_server_config(deploy=True, quiet=True)
@@ -136,13 +147,16 @@ def get_number_of_controllers() -> int:
                     config.short_worker_config.burstable_parallelism) * \
             server_config.SHORT_WORKER_MEM_GB * 1024
-        return max(1, int((total_memory_mb - used) // JOB_MEMORY_MB))
+        return min(MAX_CONTROLLERS,
+                   max(1, int((total_memory_mb - used) // JOB_MEMORY_MB)))
     else:
-        return max(
-            1,
-            int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
-                ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) * 1024
-                 + JOB_MEMORY_MB)))
+        return min(
+            MAX_CONTROLLERS,
+            max(
+                1,
+                int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                    ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) *
+                     1024 + JOB_MEMORY_MB))))
 def start_controller() -> None:
@@ -280,7 +294,8 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                                 common_utils.get_user_hash(), priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
-        run = (f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
+        run = (f'source {env_file_path} && '
+               f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
                f'--job-id {job_id} --env-file {env_file_path} '
                f'--user-yaml-path {original_user_yaml_path} '
                f'--priority {priority}')

sky/jobs/server/core.py CHANGED Viewed

@@ -407,9 +407,12 @@ def launch(
             job_identity = ''
             if job_rank is not None:
                 job_identity = f' (rank: {job_rank})'
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        f'Launching managed job {dag.name!r}{job_identity} '
-                        f'from jobs controller...{colorama.Style.RESET_ALL}')
+            job_controller_postfix = (' from jobs controller' if
+                                      consolidation_mode_job_id is None else '')
+            logger.info(
+                f'{colorama.Fore.YELLOW}'
+                f'Launching managed job {dag.name!r}{job_identity}'
+                f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
             # Launch with the api server's user hash, so that sky status does
             # not show the owner of the controller as whatever user launched
@@ -456,6 +459,8 @@ def launch(
                     managed_job_state.set_ha_recovery_script(
                         consolidation_mode_job_id, run_script)
                     backend.run_on_head(local_handle, run_script)
+                    ux_utils.starting_message(
+                        f'Job submitted, ID: {consolidation_mode_job_id}')
                     return consolidation_mode_job_id, local_handle
     if pool is None:

sky/jobs/utils.py CHANGED Viewed

@@ -11,6 +11,7 @@ import enum
 import logging
 import os
 import pathlib
+import re
 import shlex
 import textwrap
 import time
@@ -299,8 +300,10 @@ async def get_job_status(
                 job_logger.info(f'Job status: {status}')
             job_logger.info('=' * 34)
             return status
-        except (exceptions.CommandError, grpc.RpcError,
-                grpc.FutureTimeoutError) as e:
+        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+                ValueError, TypeError) as e:
+            # Note: Each of these exceptions has some additional conditions to
+            # limit how we handle it and whether or not we catch it.
             # Retry on k8s transient network errors. This is useful when using
             # coreweave which may have transient network issue sometimes.
             is_transient_error = False
@@ -319,6 +322,31 @@ async def get_job_status(
                     is_transient_error = True
             elif isinstance(e, grpc.FutureTimeoutError):
                 detailed_reason = 'Timeout'
+            # TODO(cooperc): Gracefully handle these exceptions in the backend.
+            elif isinstance(e, ValueError):
+                # If the cluster yaml is deleted in the middle of getting the
+                # SSH credentials, we could see this. See
+                # sky/global_user_state.py get_cluster_yaml_dict.
+                if re.search(r'Cluster yaml .* not found', str(e)):
+                    detailed_reason = 'Cluster yaml was deleted'
+                else:
+                    raise
+            elif isinstance(e, TypeError):
+                # We will grab the SSH credentials from the cluster yaml, but if
+                # handle.cluster_yaml is None, we will just return an empty dict
+                # for the credentials. See
+                # backend_utils.ssh_credential_from_yaml. Then, the credentials
+                # are passed as kwargs to SSHCommandRunner.__init__ - see
+                # cloud_vm_ray_backend.get_command_runners. So we can hit this
+                # TypeError if the cluster yaml is removed from the handle right
+                # when we pull it before the cluster is fully deleted.
+                error_msg_to_check = (
+                    'SSHCommandRunner.__init__() missing 2 required positional '
+                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
+                if str(e) == error_msg_to_check:
+                    detailed_reason = 'SSH credentials were already cleaned up'
+                else:
+                    raise
             if is_transient_error:
                 logger.info('Failed to connect to the cluster. Retrying '
                             f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')

sky/metrics/utils.py CHANGED Viewed

@@ -11,7 +11,9 @@ from typing import List, Optional, Tuple
 import httpx
 import prometheus_client as prom
+from sky import sky_logging
 from sky.skylet import constants
+from sky.utils import common_utils
 from sky.utils import context_utils
 _SELECT_TIMEOUT = 1
@@ -35,6 +37,8 @@ _MEM_BUCKETS = [
     float('inf'),
 ]
+logger = sky_logging.init_logger(__name__)
 # Whether the metrics are enabled, cannot be changed at runtime.
 METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                  'false').lower() == 'true'
@@ -188,53 +192,61 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     if 'KUBECONFIG' not in env:
         env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
-    # start the port forward process
-    port_forward_process = subprocess.Popen(cmd,
-                                            stdout=subprocess.PIPE,
-                                            stderr=subprocess.STDOUT,
-                                            text=True,
-                                            env=env)
+    port_forward_process = None
+    port_forward_exit = False
     local_port = None
-    start_time = time.time()
-    buffer = ''
-    # wait for the port forward to start and extract the local port
-    while time.time() - start_time < start_port_forward_timeout:
-        if port_forward_process.poll() is not None:
-            # port forward process has terminated
-            if port_forward_process.returncode != 0:
-                raise RuntimeError(
-                    f'Port forward failed for service {service} in namespace '
-                    f'{namespace} on context {context}')
-            break
-        # read output line by line to find the local port
-        if port_forward_process.stdout:
-            # Wait up to 1s for data to be available without blocking
-            r, _, _ = select.select([port_forward_process.stdout], [], [],
-                                    _SELECT_TIMEOUT)
-            if r:
-                # Read available bytes from the FD without blocking
-                fd = port_forward_process.stdout.fileno()
-                raw = os.read(fd, _SELECT_BUFFER_SIZE)
-                chunk = raw.decode(errors='ignore')
-                buffer += chunk
-                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', buffer)
-                if match:
-                    local_port = int(match.group(1))
-                    break
-        # sleep for 100ms to avoid busy-waiting
-        time.sleep(0.1)
+    try:
+        # start the port forward process
+        port_forward_process = subprocess.Popen(cmd,
+                                                stdout=subprocess.PIPE,
+                                                stderr=subprocess.STDOUT,
+                                                text=True,
+                                                env=env)
+        start_time = time.time()
+        buffer = ''
+        # wait for the port forward to start and extract the local port
+        while time.time() - start_time < start_port_forward_timeout:
+            if port_forward_process.poll() is not None:
+                # port forward process has terminated
+                if port_forward_process.returncode != 0:
+                    port_forward_exit = True
+                break
+            # read output line by line to find the local port
+            if port_forward_process.stdout:
+                # Wait up to 1s for data to be available without blocking
+                r, _, _ = select.select([port_forward_process.stdout], [], [],
+                                        _SELECT_TIMEOUT)
+                if r:
+                    # Read available bytes from the FD without blocking
+                    fd = port_forward_process.stdout.fileno()
+                    raw = os.read(fd, _SELECT_BUFFER_SIZE)
+                    chunk = raw.decode(errors='ignore')
+                    buffer += chunk
+                    match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)',
+                                      buffer)
+                    if match:
+                        local_port = int(match.group(1))
+                        break
+            # sleep for 100ms to avoid busy-waiting
+            time.sleep(0.1)
+    except BaseException:  # pylint: disable=broad-exception-caught
+        if port_forward_process:
+            stop_svc_port_forward(port_forward_process,
+                                  timeout=terminate_port_forward_timeout)
+        raise
+    if port_forward_exit:
+        raise RuntimeError(f'Port forward failed for service {service} in '
+                           f'namespace {namespace} on context {context}')
     if local_port is None:
         try:
-            port_forward_process.terminate()
-            port_forward_process.wait(timeout=terminate_port_forward_timeout)
-        except subprocess.TimeoutExpired:
-            port_forward_process.kill()
-            port_forward_process.wait()
+            if port_forward_process:
+                stop_svc_port_forward(port_forward_process,
+                                      timeout=terminate_port_forward_timeout)
         finally:
             raise RuntimeError(
                 f'Failed to extract local port for service {service} in '
@@ -243,14 +255,15 @@ def start_svc_port_forward(context: str, namespace: str, service: str,
     return port_forward_process, local_port
-def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
+def stop_svc_port_forward(port_forward_process: subprocess.Popen,
+                          timeout: int = 5) -> None:
     """Stops a port forward to a service in a Kubernetes cluster.
     Args:
         port_forward_process: The subprocess.Popen process to terminate
     """
     try:
         port_forward_process.terminate()
-        port_forward_process.wait(timeout=5)
+        port_forward_process.wait(timeout=timeout)
     except subprocess.TimeoutExpired:
         port_forward_process.kill()
         port_forward_process.wait()
@@ -301,6 +314,10 @@ async def send_metrics_request_with_port_forward(
             response.raise_for_status()
             return response.text
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error(f'Failed to send metrics request with port forward: '
+                     f'{common_utils.format_exception(e)}')
+        raise
     finally:
         # Always clean up port forward
         if port_forward_process:

sky/provision/instance_setup.py CHANGED Viewed

@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 from sky import exceptions
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky.provision import common
 from sky.provision import docker_utils
@@ -92,12 +93,6 @@ def _set_usage_run_id_cmd() -> str:
         f'{usage_constants.USAGE_RUN_ID_FILE}')
-def _set_skypilot_env_var_cmd() -> str:
-    """Sets the skypilot environment variables on the remote machine."""
-    env_vars = env_options.Options.all_options()
-    return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
 def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
     """Decorator that retries the function if it fails.
@@ -482,11 +477,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
 @common.log_function_start_end
 @_auto_retry()
 @timeline.event
-def start_skylet_on_head_node(cluster_name: str,
-                              cluster_info: common.ClusterInfo,
-                              ssh_credentials: Dict[str, Any]) -> None:
+def start_skylet_on_head_node(
+        cluster_name: resources_utils.ClusterName,
+        cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
+        launched_resources: resources_lib.Resources) -> None:
     """Start skylet on the head node."""
-    del cluster_name
+    # Avoid circular import.
+    # pylint: disable=import-outside-toplevel
+    from sky.utils import controller_utils
+    def _set_skypilot_env_var_cmd() -> str:
+        """Sets the skypilot environment variables on the remote machine."""
+        env_vars = {
+            k: str(v) for (k, v) in env_options.Options.all_options().items()
+        }
+        is_controller = controller_utils.Controllers.from_name(
+            cluster_name.display_name) is not None
+        is_kubernetes = cluster_info.provider_name == 'kubernetes'
+        if is_controller and is_kubernetes:
+            # For jobs/serve controller, we pass in the CPU and memory limits
+            # when starting the skylet to handle cases where these env vars
+            # are not set on the cluster's pod spec. The skylet will read
+            # these env vars when starting (ManagedJobEvent.start()) and write
+            # it to disk.
+            resources = launched_resources.assert_launchable()
+            vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
+                resources.instance_type)
+            if vcpus is not None:
+                env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
+            if mem is not None:
+                env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
+        return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
     runners = provision.get_command_runners(cluster_info.provider_name,
                                             cluster_info, **ssh_credentials)
     head_runner = runners[0]

sky/provision/kubernetes/utils.py CHANGED Viewed

@@ -1688,7 +1688,10 @@ def check_credentials(context: Optional[str],
     try:
         namespace = get_kube_config_context_namespace(context)
         kubernetes.core_api(context).list_namespaced_pod(
-            namespace, _request_timeout=timeout)
+            namespace, limit=1, _request_timeout=timeout)
+        # This call is "free" because this function is a cached call,
+        # and it will not be called again in this function.
+        get_kubernetes_nodes(context=context)
     except ImportError:
         # TODO(romilb): Update these error strs to also include link to docs
         #  when docs are ready.

sky/provision/provisioner.py CHANGED Viewed

@@ -18,6 +18,7 @@ from sky import exceptions
 from sky import global_user_state
 from sky import logs
 from sky import provision
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import aws
@@ -428,13 +429,14 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
 def _post_provision_setup(
-        cloud_name: str, cluster_name: resources_utils.ClusterName,
-        handle_cluster_yaml: str,
+        launched_resources: resources_lib.Resources,
+        cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
         provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str]) -> provision_common.ClusterInfo:
     config_from_yaml = global_user_state.get_cluster_yaml_dict(
         handle_cluster_yaml)
     provider_config = config_from_yaml.get('provider')
+    cloud_name = repr(launched_resources.cloud)
     cluster_info = provision.get_cluster_info(cloud_name,
                                               provision_record.region,
                                               cluster_name.name_on_cloud,
@@ -694,8 +696,9 @@ def _post_provision_setup(
                                                     cluster_info,
                                                     ssh_credentials)
-        instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
-                                                 cluster_info, ssh_credentials)
+        instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
+                                                 ssh_credentials,
+                                                 launched_resources)
     logger.info(
         ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
@@ -706,8 +709,8 @@ def _post_provision_setup(
 @timeline.event
 def post_provision_runtime_setup(
-        cloud_name: str, cluster_name: resources_utils.ClusterName,
-        handle_cluster_yaml: str,
+        launched_resources: resources_lib.Resources,
+        cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
         provision_record: provision_common.ProvisionRecord,
         custom_resource: Optional[str],
         log_dir: str) -> provision_common.ClusterInfo:
@@ -728,7 +731,7 @@ def post_provision_runtime_setup(
         try:
             logger.debug(_TITLE.format('System Setup After Provision'))
             return _post_provision_setup(
-                cloud_name,
+                launched_resources,
                 cluster_name,
                 handle_cluster_yaml=handle_cluster_yaml,
                 provision_record=provision_record,

sky/schemas/db/global_user_state/010_save_ssh_key.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""Add ssh keys in filesystem to global user state.
+Revision ID: 010
+Revises: 009
+Create Date: 2025-10-07
+"""
+import glob
+# pylint: disable=invalid-name
+import os
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+# revision identifiers, used by Alembic.
+revision: str = '010'
+down_revision: Union[str, Sequence[str], None] = '009'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade():
+    """Add last_activity_time and launched_at columns to cluster history."""
+    connection = op.get_bind()
+    match_dirs = glob.glob(os.path.expanduser('~/.sky/clients/*/ssh'))
+    file_user_hashes = set()
+    for match_dir in match_dirs:
+        user_hash = match_dir.split('/')[-2]
+        file_user_hashes.add(user_hash)
+    # Get all existing ssh keys
+    existing_user_hashes = set()
+    result = connection.execute(sa.text('SELECT user_hash FROM ssh_key'))
+    for row in result:
+        existing_user_hashes.add(row[0])
+    user_hashes_to_add = file_user_hashes - existing_user_hashes
+    for user_hash in user_hashes_to_add:
+        match_dir = os.path.join(os.path.expanduser('~/.sky/clients'),
+                                 user_hash, 'ssh')
+        public_key_path = os.path.join(match_dir, 'sky-key.pub')
+        private_key_path = os.path.join(match_dir, 'sky-key')
+        try:
+            with open(public_key_path, 'r', encoding='utf-8') as f:
+                public_key = f.read().strip()
+            with open(private_key_path, 'r', encoding='utf-8') as f:
+                private_key = f.read().strip()
+        except FileNotFoundError:
+            # Skip if the key files are not found
+            continue
+        connection.execute(
+            sa.text('INSERT INTO ssh_key '
+                    '(user_hash, ssh_public_key, ssh_private_key) '
+                    'VALUES (:user_hash, :ssh_public_key, :ssh_private_key) '
+                    'ON CONFLICT DO NOTHING'), {
+                        'user_hash': user_hash,
+                        'ssh_public_key': public_key,
+                        'ssh_private_key': private_key
+                    })
+def downgrade():
+    """No-op for backward compatibility."""
+    pass

sky/server/common.py CHANGED Viewed

@@ -950,6 +950,7 @@ def clear_local_api_server_database() -> None:
     db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
     for extension in ['', '-shm', '-wal']:
         try:
+            logger.debug(f'Removing database file {db_path}{extension}')
             os.remove(f'{db_path}{extension}')
         except FileNotFoundError:
             logger.debug(f'Database file {db_path}{extension} not found.')

sky/server/config.py CHANGED Viewed

@@ -111,7 +111,9 @@ def compute_server_config(deploy: bool,
     process after API server was introduced.
     """
     cpu_count = common_utils.get_cpu_count()
+    logger.debug(f'CPU count: {cpu_count}')
     mem_size_gb = common_utils.get_mem_size_gb()
+    logger.debug(f'Memory size: {mem_size_gb}GB')
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)

skypilot-nightly 1.0.0.dev20251005__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl

Potentially problematic release.

skypilot-nightly 1.0.0.dev20251005__py3-none-any.whl → 1.0.0.dev20251008__py3-none-any.whl