skypilot-nightly 1.0.0.dev20250219__py3-none-any.whl → 1.0.0.dev20250221__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +85 -0
- sky/backends/backend_utils.py +8 -0
- sky/backends/cloud_vm_ray_backend.py +10 -2
- sky/client/sdk.py +8 -3
- sky/clouds/__init__.py +2 -0
- sky/clouds/nebius.py +294 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/jobs/controller.py +17 -0
- sky/jobs/server/core.py +31 -3
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/instance.py +5 -1
- sky/provision/kubernetes/utils.py +8 -7
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +310 -0
- sky/server/common.py +5 -7
- sky/server/requests/executor.py +94 -87
- sky/server/server.py +10 -5
- sky/server/stream_utils.py +8 -11
- sky/setup_files/dependencies.py +9 -1
- sky/skylet/constants.py +3 -6
- sky/task.py +6 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/utils/common_utils.py +38 -0
- sky/utils/controller_utils.py +66 -2
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/METADATA +8 -4
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/RECORD +35 -27
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250219.dist-info → skypilot_nightly-1.0.0.dev20250221.dist-info}/top_level.txt +0 -0
sky/provision/nebius/utils.py
ADDED
@@ -0,0 +1,310 @@
+"""Nebius library wrapper for SkyPilot."""
+import time
+from typing import Any, Dict
+import uuid
+
+from sky import sky_logging
+from sky.adaptors import nebius
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+POLL_INTERVAL = 5
+
+
+def retry(func):
+    """Decorator to retry a function."""
+
+    def wrapper(*args, **kwargs):
+        """Wrapper for retrying a function."""
+        cnt = 0
+        while True:
+            try:
+                return func(*args, **kwargs)
+            except nebius.nebius.error.QueryError as e:
+                if cnt >= 3:
+                    raise
+                logger.warning('Retrying for exception: '
+                               f'{common_utils.format_exception(e)}.')
+                time.sleep(POLL_INTERVAL)
+
+    return wrapper
+
+
+def get_project_by_region(region: str) -> str:
+    service = nebius.iam().ProjectServiceClient(nebius.sdk())
+    projects = service.list(nebius.iam().ListProjectsRequest(
+        parent_id=nebius.get_tenant_id())).wait()
+    # To find a project in a specific region, we rely on the project ID to
+    # deduce the region, since there is currently no method to retrieve region
+    # information directly from the project. Additionally, there is only one
+    # project per region, and projects cannot be created at this time.
+    # The region is determined from the project ID using a region-specific
+    # identifier embedded in it.
+    # Project id looks like project-e00xxxxxxxxxxxxxx where
+    # e00 - id of region 'eu-north1'
+    # e01 - id of region 'eu-west1'
+    # TODO(SalikovAlex): fix when info about region will be in projects list
+    # Currently, Nebius cloud supports 2 regions. We manually enumerate
+    # them here. Reference: https://docs.nebius.com/overview/regions
+    for project in projects.items:
+        if region == 'eu-north1' and project.metadata.id[8:11] == 'e00':
+            return project.metadata.id
+        if region == 'eu-west1' and project.metadata.id[8:11] == 'e01':
+            return project.metadata.id
+    raise Exception(f'No project found for region "{region}".')
+
+
+def get_or_create_gpu_cluster(name: str, region: str) -> str:
+    """Creates a GPU cluster.
+    When creating a GPU cluster, select an InfiniBand fabric for it:
+
+    fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
+    fabric-5 for projects in the eu-west1 region.
+
+    https://docs.nebius.com/compute/clusters/gpu
+    """
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+    except nebius.request_error() as no_cluster_found_error:
+        if region == 'eu-north1':
+            fabric = 'fabric-4'
+        elif region == 'eu-west1':
+            fabric = 'fabric-5'
+        else:
+            raise RuntimeError(
+                f'Unsupported region {region}.') from no_cluster_found_error
+        cluster = service.create(nebius.compute().CreateGpuClusterRequest(
+            metadata=nebius.nebius_common().ResourceMetadata(
+                parent_id=project_id,
+                name=name,
+            ),
+            spec=nebius.compute().GpuClusterSpec(
+                infiniband_fabric=fabric))).wait()
+        cluster_id = cluster.resource_id
+    return cluster_id
+
+
+def delete_cluster(name: str, region: str) -> None:
+    """Delete a GPU cluster."""
+    project_id = get_project_by_region(region)
+    service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
+    try:
+        cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=name,
+        )).wait()
+        cluster_id = cluster.metadata.id
+        logger.debug(f'Found GPU Cluster : {cluster_id}.')
+        service.delete(
+            nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
+        logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
+    except nebius.request_error():
+        logger.debug('GPU Cluster does not exist.')
+
+
+def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
+    """Lists instances associated with API key."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.list(
+        nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
+
+    instances = result
+
+    instance_dict: Dict[str, Dict[str, Any]] = {}
+    for instance in instances.items:
+        info = {}
+        info['status'] = instance.status.state.name
+        info['name'] = instance.metadata.name
+        if instance.status.network_interfaces:
+            info['external_ip'] = instance.status.network_interfaces[
+                0].public_ip_address.address.split('/')[0]
+            info['internal_ip'] = instance.status.network_interfaces[
+                0].ip_address.address.split('/')[0]
+        instance_dict[instance.metadata.id] = info
+
+    return instance_dict
+
+
+def stop(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'STOPPED':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} stopping.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_STOP:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_STOP * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be stopped.')
+
+
+def start(instance_id: str) -> None:
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get(nebius.compute().GetInstanceRequest(
+            id=instance_id,)).wait()
+        if instance.status.state.name == 'RUNNING':
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_id} starting.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_START:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_START * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_id}'
+            f' to be ready.')
+
+
+def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
+           preset: str, region: str, image_family: str, disk_size: int,
+           user_data: str) -> str:
+    # Each node must have a unique name to avoid conflicts between
+    # multiple worker VMs. To ensure uniqueness, a UUID is appended
+    # to the node name.
+    instance_name = (f'{cluster_name_on_cloud}-'
+                     f'{uuid.uuid4().hex[:4]}-{node_type}')
+    logger.debug(f'Launching instance: {instance_name}')
+
+    disk_name = 'disk-' + instance_name
+    cluster_id = None
+    # 8 GPU virtual machines can be grouped into a GPU cluster.
+    # The GPU clusters are built with InfiniBand secure high-speed networking.
+    # https://docs.nebius.com/compute/clusters/gpu
+    if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
+        if preset == '8gpu-128vcpu-1600gb':
+            cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
+                                                   region)
+
+    project_id = get_project_by_region(region)
+    service = nebius.compute().DiskServiceClient(nebius.sdk())
+    disk = service.create(nebius.compute().CreateDiskRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=disk_name,
+        ),
+        spec=nebius.compute().DiskSpec(
+            source_image_family=nebius.compute().SourceImageFamily(
+                image_family=image_family),
+            size_gibibytes=disk_size,
+            type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
+        ))).wait()
+    disk_id = disk.resource_id
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
+        disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=disk_name,
+        )).wait()
+        if disk.status.state.name == 'READY':
+            break
+        logger.debug(f'Waiting for disk {disk_name} to be ready.')
+        time.sleep(POLL_INTERVAL)
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_CREATE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_CREATE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_name}'
+            f' to be ready.')
+
+    service = nebius.vpc().SubnetServiceClient(nebius.sdk())
+    sub_net = service.list(nebius.vpc().ListSubnetsRequest(
+        parent_id=project_id,)).wait()
+
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    service.create(nebius.compute().CreateInstanceRequest(
+        metadata=nebius.nebius_common().ResourceMetadata(
+            parent_id=project_id,
+            name=instance_name,
+        ),
+        spec=nebius.compute().InstanceSpec(
+            gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
+            if cluster_id is not None else None,
+            boot_disk=nebius.compute().AttachedDiskSpec(
+                attach_mode=nebius.compute(
+                ).AttachedDiskSpec.AttachMode.READ_WRITE,
+                existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
+            cloud_init_user_data=user_data,
+            resources=nebius.compute().ResourcesSpec(platform=platform,
+                                                     preset=preset),
+            network_interfaces=[
+                nebius.compute().NetworkInterfaceSpec(
+                    subnet_id=sub_net.items[0].metadata.id,
+                    ip_address=nebius.compute().IPAddress(),
+                    name='network-interface-0',
+                    public_ip_address=nebius.compute().PublicIPAddress())
+            ]))).wait()
+    instance_id = ''
+    retry_count = 0
+    while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        service = nebius.compute().InstanceServiceClient(nebius.sdk())
+        instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
+            parent_id=project_id,
+            name=instance_name,
+        )).wait()
+        if instance.status.state.name == 'STARTING':
+            instance_id = instance.metadata.id
+            break
+        time.sleep(POLL_INTERVAL)
+        logger.debug(f'Waiting for instance {instance_name} start running.')
+        retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
+            f' seconds) while waiting for instance {instance_name}'
+            f' to be ready.')
+    return instance_id
+
+
+def remove(instance_id: str) -> None:
+    """Terminates the given instance."""
+    service = nebius.compute().InstanceServiceClient(nebius.sdk())
+    result = service.get(
+        nebius.compute().GetInstanceRequest(id=instance_id)).wait()
+    disk_id = result.spec.boot_disk.existing_disk.id
+    service.delete(
+        nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
+    retry_count = 0
+    # The instance begins deleting and attempts to delete the disk.
+    # Must wait until the disk is unlocked and becomes deletable.
+    while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
+        try:
+            service = nebius.compute().DiskServiceClient(nebius.sdk())
+            service.delete(
+                nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
+            break
+        except nebius.request_error():
+            logger.debug('Waiting for disk deletion.')
+            time.sleep(POLL_INTERVAL)
+            retry_count += 1
+
+    if retry_count == nebius.MAX_RETRIES_TO_DISK_DELETE:
+        raise TimeoutError(
+            f'Exceeded maximum retries '
+            f'({nebius.MAX_RETRIES_TO_DISK_DELETE * POLL_INTERVAL}'
+            f' seconds) while waiting for disk {disk_id}'
+            f' to be deleted.')
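All of the lifecycle helpers in this new module (`stop`, `start`, `launch`, `remove`) share one poll-until-state loop: issue the request, then re-read the resource every `POLL_INTERVAL` seconds up to a retry cap, and raise `TimeoutError` when the cap is hit. A minimal sketch of that pattern; the names `wait_until`, `fetch_state`, `max_retries`, and the demo instance are illustrative, not part of the package:

```python
import time
from typing import Callable


def wait_until(fetch_state: Callable[[], str], target: str, resource: str,
               max_retries: int = 120, poll_interval: float = 5) -> None:
    """Polls fetch_state() until it returns `target`, else raises TimeoutError."""
    for _ in range(max_retries):
        if fetch_state() == target:
            return
        time.sleep(poll_interval)
    raise TimeoutError(f'Exceeded maximum retries '
                       f'({max_retries * poll_interval} seconds) while '
                       f'waiting for {resource} to reach {target!r}.')


# Example: a fake instance that reports RUNNING on the third poll.
states = iter(['STARTING', 'STARTING', 'RUNNING'])
wait_until(lambda: next(states), 'RUNNING', 'instance demo-0001',
           poll_interval=0)
```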
sky/server/common.py
CHANGED
@@ -15,7 +15,6 @@ import uuid
 
 import colorama
 import filelock
-import psutil
 import pydantic
 import requests
 
@@ -146,13 +145,14 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
     return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
 
 
-def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
+def start_api_server_in_background(deploy: bool = False,
+                                   host: str = '127.0.0.1'):
     if not is_api_server_local():
         raise RuntimeError(
             f'Cannot start API server: {get_server_url()} is not a local URL')
 
     # Check available memory before starting the server.
-    avail_mem_size_gb: float =
+    avail_mem_size_gb: float = common_utils.get_mem_size_gb()
     if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
         logger.warning(
             f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only has '
@@ -163,8 +163,6 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
     log_path = os.path.expanduser(constants.API_SERVER_LOGS)
     os.makedirs(os.path.dirname(log_path), exist_ok=True)
 
-    # The command to run uvicorn. Adjust the app:app to your application's
-    # location.
     api_server_cmd = API_SERVER_CMD
     if deploy:
         api_server_cmd += ' --deploy'
@@ -172,7 +170,7 @@ def start_uvicorn_in_background(deploy: bool = False, host: str = '127.0.0.1'):
         api_server_cmd += f' --host {host}'
     cmd = f'{sys.executable} {api_server_cmd} > {log_path} 2>&1'
 
-    # Start the
+    # Start the API server process in the background and don't wait for it.
     # If this is called from a CLI invocation, we need start_new_session=True so
     # that SIGINT on the CLI will not also kill the API server.
     subprocess.Popen(cmd, shell=True, start_new_session=True)
@@ -232,7 +230,7 @@ def _start_api_server(deploy: bool = False, host: str = '127.0.0.1'):
                     f'SkyPilot API server at {server_url}. '
                     'Starting a local server.'
                     f'{colorama.Style.RESET_ALL}')
-        start_uvicorn_in_background(deploy=deploy, host=host)
+        start_api_server_in_background(deploy=deploy, host=host)
         logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
 
 
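The `start_new_session=True` argument in the hunk above is what the retained comment describes: the child is placed in its own session, outside the CLI's process group, so a Ctrl-C (SIGINT) delivered to the CLI does not also terminate the background API server. A small sketch of the same detachment pattern; the command and log path below are placeholders, not SkyPilot's actual server command:

```python
import subprocess
import sys

# Placeholder command; SkyPilot substitutes its API server command here.
cmd = f'{sys.executable} -m http.server 8080 > /tmp/demo-server.log 2>&1'

# start_new_session=True detaches the child into its own session, so SIGINT
# sent to the launching shell's process group does not reach it.
proc = subprocess.Popen(cmd, shell=True, start_new_session=True)
print(f'Started background server with PID {proc.pid}')
```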
sky/server/requests/executor.py
CHANGED
@@ -32,7 +32,6 @@ import traceback
 import typing
 from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
 
-import psutil
 import setproctitle
 
 from sky import global_user_state
@@ -70,18 +69,36 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)
 
-# Constants based on profiling the peak memory usage
-#
-#
-
-
-#
-
-
-#
+# Constants based on profiling the peak memory usage while serving various
+# sky commands. These estimation are highly related to usage patterns
+# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
+# the profiling covers major clouds and common usage patterns. For user has
+# deviated usage pattern, they can override the default estimation by
+# environment variables.
+# NOTE(dev): update these constants for each release according to the load
+# test results.
+# TODO(aylei): maintaining these constants is error-prone, we may need to
+# automatically tune parallelism at runtime according to system usage stats
+# in the future.
+_LONG_WORKER_MEM_GB = 0.4
+_SHORT_WORKER_MEM_GB = 0.25
+# To control the number of long workers.
+_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
+# Limit the number of long workers of local API server, since local server is
+# typically:
+# 1. launched automatically in an environment with high resource contention
+# (e.g. Laptop)
+# 2. used by a single user
+_MAX_LONG_WORKERS_LOCAL = 4
+# Percentage of memory for long requests
 # from the memory reserved for SkyPilot.
-# This is to reserve some memory for
+# This is to reserve some memory for short requests.
 _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
+# Minimal number of long workers to ensure responsiveness.
+_MIN_LONG_WORKERS = 1
+# Minimal number of short workers, there is a daemon task running on short
+# workers so at least 2 workers are needed to ensure responsiveness.
+_MIN_SHORT_WORKERS = 2
 
 
 class QueueBackend(enum.Enum):
@@ -301,34 +318,32 @@ def schedule_request(request_id: str,
     _get_queue(schedule_type).put(input_tuple)
 
 
+def executor_initializer(proc_group: str):
+    setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
+                              f'{multiprocessing.current_process().pid}')
+
+
 def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     """Worker for the requests.
 
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
-
-
-    setproctitle.setproctitle(
-        f'SkyPilot:worker:{worker.schedule_type.value}-{worker.id}')
+    proc_group = f'{worker.schedule_type.value}-{worker.id}'
+    setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
-
-
-
-    # We use executor instead of individual multiprocessing.Process to avoid
-    # the overhead of forking a new process for each request, which can be about
-    # 1s delay.
-    with concurrent.futures.ProcessPoolExecutor(
-            max_workers=max_parallel_size) as executor:
-        while True:
+
+    def process_request(executor: concurrent.futures.ProcessPoolExecutor):
+        try:
             request_element = queue.get()
             if request_element is None:
                 time.sleep(0.1)
-
+                return
             request_id, ignore_return_value = request_element
             request = api_requests.get_request(request_id)
+            assert request is not None, f'Request with ID {request_id} is None'
             if request.status == api_requests.RequestStatus.CANCELLED:
-
+                return
             logger.info(f'[{worker}] Submitting request: {request_id}')
             # Start additional process to run the request, so that it can be
            # cancelled when requested by a user.
@@ -347,60 +362,49 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
                 logger.info(f'[{worker}] Finished request: {request_id}')
             else:
                 logger.info(f'[{worker}] Submitted request: {request_id}')
+        except KeyboardInterrupt:
+            # Interrupt the worker process will stop request execution, but
+            # the SIGTERM request should be respected anyway since it might
+            # be explicitly sent by user.
+            # TODO(aylei): crash the API server or recreate the worker process
+            # to avoid broken state.
+            logger.error(f'[{worker}] Worker process interrupted')
+            raise
+        except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+            # Catch any other exceptions to avoid crashing the worker process.
+            logger.error(
+                f'[{worker}] Error processing request {request_id}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
 
-
-
-
-
-
-
-
-
-
-
-
-
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    f'Failed to parse the number of CPUs from {cpu_count}'
-                ) from e
-    return psutil.cpu_count()
-
-
-def _get_mem_size_gb() -> float:
-    """Get the memory size in GB.
-
-    If the API server is deployed as a pod in k8s cluster, we assume the
-    memory size is provided by the downward API.
-    """
-    mem_size = os.getenv('SKYPILOT_POD_MEMORY_GB_LIMIT')
-    if mem_size is not None:
-        try:
-            return float(mem_size)
-        except ValueError as e:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    f'Failed to parse the memory size from {mem_size}') from e
-    return psutil.virtual_memory().total / (1024**3)
+    # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
+    # because the former is more efficient with the support of lazy creation of
+    # worker processes.
+    # We use executor instead of individual multiprocessing.Process to avoid
+    # the overhead of forking a new process for each request, which can be about
+    # 1s delay.
+    with concurrent.futures.ProcessPoolExecutor(
+            max_workers=max_parallel_size,
+            initializer=executor_initializer,
+            initargs=(proc_group,)) as executor:
+        while True:
+            process_request(executor)
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
     """Start the request workers."""
     # Determine the job capacity of the workers based on the system resources.
-    cpu_count =
-    mem_size_gb =
+    cpu_count = common_utils.get_cpu_count()
+    mem_size_gb = common_utils.get_mem_size_gb()
     mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
-
-
-
-
-
-    max_parallel_for_non_blocking = _max_parallel_size_for_non_blocking(
-        mem_size_gb, parallel_for_blocking)
+    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
+                                                       mem_size_gb,
+                                                       local=not deploy)
+    max_parallel_for_short = _max_short_worker_parallism(
+        mem_size_gb, max_parallel_for_long)
     logger.info(
-        f'SkyPilot API server will start {
-        f'
-        f'{
+        f'SkyPilot API server will start {max_parallel_for_long} workers for '
+        f'long requests and will allow at max '
+        f'{max_parallel_for_short} short requests in parallel.')
 
     # Setup the queues.
     if queue_backend == QueueBackend.MULTIPROCESSING:
@@ -424,7 +428,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
     logger.info('Request queues created')
 
     worker_procs = []
-    for worker_id in range(
+    for worker_id in range(max_parallel_for_long):
         worker = RequestWorker(id=worker_id,
                                schedule_type=api_requests.ScheduleType.LONG)
         worker_proc = multiprocessing.Process(target=request_worker,
@@ -432,31 +436,34 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
         worker_proc.start()
         worker_procs.append(worker_proc)
 
-    # Start a
+    # Start a worker for short requests.
     worker = RequestWorker(id=1, schedule_type=api_requests.ScheduleType.SHORT)
     worker_proc = multiprocessing.Process(target=request_worker,
-                                          args=(worker,
-                                                max_parallel_for_non_blocking))
+                                          args=(worker, max_parallel_for_short))
     worker_proc.start()
     worker_procs.append(worker_proc)
     return worker_procs
 
 
 @annotations.lru_cache(scope='global', maxsize=1)
-def
-
-
+def _max_long_worker_parallism(cpu_count: int,
+                               mem_size_gb: float,
+                               local=False) -> int:
+    """Max parallelism for long workers."""
+    cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
     mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
-
-    n = max(
+                                 _LONG_WORKER_MEM_GB)
+    n = max(_MIN_LONG_WORKERS,
+            min(cpu_based_max_parallel, mem_based_max_parallel))
+    if local:
+        return min(n, _MAX_LONG_WORKERS_LOCAL)
     return n
 
 
 @annotations.lru_cache(scope='global', maxsize=1)
-def
-
-    """Max parallelism for
-    available_mem = mem_size_gb - (
-
-    n = max(1, int(available_mem / _PER_NON_BLOCKING_REQUEST_MEM_GB))
+def _max_short_worker_parallism(mem_size_gb: float,
+                                long_worker_parallism: int) -> int:
+    """Max parallelism for short workers."""
+    available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
+    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
     return n
sky/server/server.py
CHANGED
@@ -57,7 +57,9 @@ P = ParamSpec('P')
 
 def _add_timestamp_prefix_for_server_logs() -> None:
     server_logger = sky_logging.init_logger('sky.server')
-    #
+    # Clear existing handlers first to prevent duplicates
+    server_logger.handlers.clear()
+    # Disable propagation to avoid the root logger of SkyPilot being affected
     server_logger.propagate = False
     # Add date prefix to the log message printed by loggers under
     # server.
@@ -460,6 +462,7 @@ async def launch(launch_body: payloads.LaunchBody,
                  request: fastapi.Request) -> None:
     """Launches a cluster or task."""
     request_id = request.state.request_id
+    logger.info(f'Launching request: {request_id}')
     executor.schedule_request(
         request_id,
         request_name='launch',
@@ -627,6 +630,9 @@ async def logs(
         request_name='logs',
         request_body=cluster_job_body,
         func=core.tail_logs,
+        # TODO(aylei): We have tail logs scheduled as SHORT request, because it
+        # should be responsive. However, it can be long running if the user's
+        # job keeps running, and we should avoid it taking the SHORT worker.
         schedule_type=requests_lib.ScheduleType.SHORT,
         request_cluster_name=cluster_job_body.cluster_name,
     )
@@ -794,10 +800,9 @@ async def api_get(request_id: str) -> requests_lib.RequestPayload:
                     detail=dataclasses.asdict(
                         request_task.encode()))
             return request_task.encode()
-        #
-        #
-
-        await asyncio.sleep(0)
+        # yield control to allow other coroutines to run, sleep shortly
+        # to avoid storming the DB and CPU in the meantime
+        await asyncio.sleep(0.1)
 
 
 @app.get('/api/stream')