skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +91 -38
- sky/client/cli.py +91 -38
- sky/client/sdk.py +3 -2
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +8 -2
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +42 -19
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +18 -10
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/optimizer.py +35 -11
- sky/provision/common.py +2 -5
- sky/provision/docker_utils.py +22 -16
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +276 -93
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +36 -24
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/skylet/constants.py +7 -0
- sky/skypilot_config.py +27 -6
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +145 -15
- sky/templates/nebius-ray.yml.j2 +63 -0
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/server/config.py
ADDED
@@ -0,0 +1,184 @@
+"""SkyPilot API Server configuration."""
+
+import dataclasses
+import enum
+
+from sky import sky_logging
+from sky.server import constants as server_constants
+from sky.utils import common_utils
+
+# Constants based on profiling the peak memory usage while serving various
+# sky commands. These estimation are highly related to usage patterns
+# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
+# the profiling covers major clouds and common usage patterns. For user has
+# deviated usage pattern, they can override the default estimation by
+# environment variables.
+# NOTE(dev): update these constants for each release according to the load
+# test results.
+# TODO(aylei): maintaining these constants is error-prone, we may need to
+# automatically tune parallelism at runtime according to system usage stats
+# in the future.
+_LONG_WORKER_MEM_GB = 0.4
+_SHORT_WORKER_MEM_GB = 0.25
+# To control the number of long workers.
+_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
+# Limit the number of long workers of local API server, since local server is
+# typically:
+# 1. launched automatically in an environment with high resource contention
+#    (e.g. Laptop)
+# 2. used by a single user
+_MAX_LONG_WORKERS_LOCAL = 4
+# Percentage of memory for long requests
+# from the memory reserved for SkyPilot.
+# This is to reserve some memory for short requests.
+_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
+# Minimal number of long workers to ensure responsiveness.
+_MIN_LONG_WORKERS = 1
+# Minimal number of short workers, there is a daemon task running on short
+# workers so at least 2 workers are needed to ensure responsiveness.
+_MIN_SHORT_WORKERS = 2
+
+# Default number of burstable workers for local API server. A heuristic number
+# that is large enough for most local cases.
+# TODO(aylei): the number of burstable workers should be auto-tuned based on the
+# system usage stats.
+_BURSTABLE_WORKERS_FOR_LOCAL = 1024
+
+logger = sky_logging.init_logger(__name__)
+
+
+class QueueBackend(enum.Enum):
+    # Local queue backend serves queues in each process locally, which has
+    # lower resource usage but the consumer must be in the same process, i.e.
+    # this only works in single-process mode.
+    LOCAL = 'local'
+    # Multi-process queue backend starts a dedicated process for serving queues.
+    MULTIPROCESSING = 'multiprocessing'
+    # TODO(zhwu): we can add redis backend in the future.
+
+
+@dataclasses.dataclass
+class WorkerConfig:
+    garanteed_parallelism: int
+    burstable_parallelism: int
+
+
+@dataclasses.dataclass
+class ServerConfig:
+    num_server_workers: int
+    long_worker_config: WorkerConfig
+    short_worker_config: WorkerConfig
+    queue_backend: QueueBackend
+
+
+def compute_server_config(deploy: bool) -> ServerConfig:
+    """Compute the server config based on environment.
+
+    We have different assumptions for the resources in different deployment
+    modes, which leads to different worker setups:
+
+    - Deployment mode (deploy=True), we assume the resources are dedicated to
+      the API server and the resources will be tuned for serious use cases, so:
+      - Use multiprocessing queue backend and dedicated workers processes to
+        avoid GIL contention.
+      - Parallelism (number of executor processes) is fixed and executor
+        processes have same lifecycle with the server, which ensures
+        best-effort cache reusing and stable resources consumption.
+      - Reject to start in low resource environments, to avoid flaky
+        deployments.
+    - Local mode (deploy=False), we assume the server is running in a shared
+      environment (e.g. laptop) and users typically do not pay attention to
+      the resource setup of the server. Moreover, existing users may expect
+      some consistent behaviors with old versions, i.e. before API server was
+      introduced, so:
+      - The max number of long-running executor processes are limited, to avoid
+        high memory consumption when the server is idle.
+      - Allow burstable workers to handle requests when all long-running
+        workers are busy, which mimics the behavior of local sky CLI before
+        API server was introduced.
+      - Works in low resources environments, and further reduce the memory
+        consumption in low resource environments.
+
+    Note that there is still significant overhead for SDK users when migrate to
+    local API server. Since the users are free to run sky operations in Threads
+    when using SDK but all client operations will occupy at least one worker
+    process after API server was introduced.
+    """
+    cpu_count = common_utils.get_cpu_count()
+    mem_size_gb = common_utils.get_mem_size_gb()
+    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
+                                                       mem_size_gb,
+                                                       local=not deploy)
+    max_parallel_for_short = _max_short_worker_parallism(
+        mem_size_gb, max_parallel_for_long)
+    queue_backend = QueueBackend.MULTIPROCESSING
+    burstable_parallel_for_long = 0
+    burstable_parallel_for_short = 0
+    num_server_workers = cpu_count
+    if not deploy:
+        # For local mode, use local queue backend since we only run 1 uvicorn
+        # worker in local mode and no multiprocessing is needed.
+        num_server_workers = 1
+        queue_backend = QueueBackend.LOCAL
+        # Enable burstable workers for local API server.
+        burstable_parallel_for_long = _BURSTABLE_WORKERS_FOR_LOCAL
+        burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
+    # Runs in low resource mode if the available memory is less than
+    # server_constants.MIN_AVAIL_MEM_GB.
+    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        # Permanent worker process may have significant memory consumption
+        # (~350MB per worker) after running commands like `sky check`, so we
+        # don't start any permanent workers in low resource local mode. This
+        # mimics the behavior of local sky CLI before API server was
+        # introduced, where the CLI will start new process everytime and
+        # never reject to start due to resource constraints.
+        # Note that the refresh daemon will still occupy one worker
+        # permanently because it never exits.
+        max_parallel_for_long = 0
+        max_parallel_for_short = 0
+        logger.warning(
+            'SkyPilot API server will run in low resource mode because '
+            'the available memory is less than '
+            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    logger.info(
+        f'SkyPilot API server will start {num_server_workers} server processes '
+        f'with {max_parallel_for_long} background workers for long requests '
+        f'and will allow at max {max_parallel_for_short} short requests in '
+        f'parallel.')
+    return ServerConfig(
+        num_server_workers=num_server_workers,
+        queue_backend=queue_backend,
+        long_worker_config=WorkerConfig(
+            garanteed_parallelism=max_parallel_for_long,
+            burstable_parallelism=burstable_parallel_for_long),
+        short_worker_config=WorkerConfig(
+            garanteed_parallelism=max_parallel_for_short,
+            burstable_parallelism=burstable_parallel_for_short),
+    )
+
+
+def _max_long_worker_parallism(cpu_count: int,
+                               mem_size_gb: float,
+                               local=False) -> int:
+    """Max parallelism for long workers."""
+    # Reserve min available memory to avoid OOM.
+    available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
+    mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
+                                 _LONG_WORKER_MEM_GB)
+    n = max(_MIN_LONG_WORKERS,
+            min(cpu_based_max_parallel, mem_based_max_parallel))
+    if local:
+        return min(n, _MAX_LONG_WORKERS_LOCAL)
+    return n
+
+
+def _max_short_worker_parallism(mem_size_gb: float,
+                                long_worker_parallism: int) -> int:
+    """Max parallelism for short workers."""
+    # Reserve memory for long workers and min available memory.
+    reserved_mem = server_constants.MIN_AVAIL_MEM_GB + (long_worker_parallism *
+                                                        _LONG_WORKER_MEM_GB)
+    available_mem = max(0, mem_size_gb - reserved_mem)
+    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
+    return n
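
For intuition, the sizing formulas above can be traced by hand. The sketch below reproduces the math of `_max_long_worker_parallism` and `_max_short_worker_parallism` for a hypothetical 8-CPU, 16 GB deploy-mode machine; the `MIN_AVAIL_MEM_GB = 2.0` value is an assumed stand-in for `server_constants.MIN_AVAIL_MEM_GB`, which is defined outside this diff.

# Standalone sketch of the worker-sizing math in sky/server/config.py.
# MIN_AVAIL_MEM_GB is an assumed placeholder; the real constant lives in
# sky/server/constants.py and is not part of this diff.
MIN_AVAIL_MEM_GB = 2.0

_LONG_WORKER_MEM_GB = 0.4
_SHORT_WORKER_MEM_GB = 0.25
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
_MAX_LONG_WORKERS_LOCAL = 4
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
_MIN_LONG_WORKERS = 1
_MIN_SHORT_WORKERS = 2


def max_long_workers(cpu_count: int, mem_size_gb: float, local: bool) -> int:
    # Memory left after the reserve; long workers may use at most 60% of it.
    available_mem = max(0, mem_size_gb - MIN_AVAIL_MEM_GB)
    cpu_based = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
    mem_based = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
                    _LONG_WORKER_MEM_GB)
    n = max(_MIN_LONG_WORKERS, min(cpu_based, mem_based))
    return min(n, _MAX_LONG_WORKERS_LOCAL) if local else n


def max_short_workers(mem_size_gb: float, long_workers: int) -> int:
    # Short workers get whatever memory the long workers and reserve leave.
    reserved = MIN_AVAIL_MEM_GB + long_workers * _LONG_WORKER_MEM_GB
    available_mem = max(0, mem_size_gb - reserved)
    return max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))


long_n = max_long_workers(cpu_count=8, mem_size_gb=16.0, local=False)
short_n = max_short_workers(mem_size_gb=16.0, long_workers=long_n)
print(long_n, short_n)  # 16 30: long workers are CPU-bound, short ones memory-bound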
sky/server/requests/executor.py
CHANGED
@@ -19,7 +19,6 @@ The number of the workers is determined by the system resources.
 See the [README.md](../README.md) for detailed architecture of the executor.
 """
 import contextlib
-import enum
 import multiprocessing
 import os
 import queue as queue_lib
@@ -37,6 +36,7 @@ from sky import models
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import common as server_common
+from sky.server import config as server_config
 from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
@@ -70,53 +70,6 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)
 
-# Constants based on profiling the peak memory usage while serving various
-# sky commands. These estimation are highly related to usage patterns
-# (clouds enabled, type of requests, etc. see `tests/load_tests` for details.),
-# the profiling covers major clouds and common usage patterns. For user has
-# deviated usage pattern, they can override the default estimation by
-# environment variables.
-# NOTE(dev): update these constants for each release according to the load
-# test results.
-# TODO(aylei): maintaining these constants is error-prone, we may need to
-# automatically tune parallelism at runtime according to system usage stats
-# in the future.
-_LONG_WORKER_MEM_GB = 0.4
-_SHORT_WORKER_MEM_GB = 0.25
-# To control the number of long workers.
-_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
-# Limit the number of long workers of local API server, since local server is
-# typically:
-# 1. launched automatically in an environment with high resource contention
-#    (e.g. Laptop)
-# 2. used by a single user
-_MAX_LONG_WORKERS_LOCAL = 4
-# Percentage of memory for long requests
-# from the memory reserved for SkyPilot.
-# This is to reserve some memory for short requests.
-_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
-# Minimal number of long workers to ensure responsiveness.
-_MIN_LONG_WORKERS = 1
-# Minimal number of short workers, there is a daemon task running on short
-# workers so at least 2 workers are needed to ensure responsiveness.
-_MIN_SHORT_WORKERS = 2
-
-# Default number of burstable workers for local API server. A heuristic number
-# that is large enough for most local cases.
-# TODO(aylei): the number of burstable workers should be auto-tuned based on the
-# system usage stats.
-_BURSTABLE_WORKERS_FOR_LOCAL = 1024
-
-
-class QueueBackend(enum.Enum):
-    # Local queue backend serves queues in each process locally, which has
-    # lower resource usage but the consumer must be in the same process, i.e.
-    # this only works in single-process mode.
-    LOCAL = 'local'
-    # Multi-process queue backend starts a dedicated process for serving queues.
-    MULTIPROCESSING = 'multiprocessing'
-    # TODO(zhwu): we can add redis backend in the future.
-
 
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
@@ -126,12 +79,12 @@ class RequestQueue:
 
     def __init__(self,
                  schedule_type: api_requests.ScheduleType,
-                 backend: Optional[QueueBackend] = None) -> None:
+                 backend: Optional[server_config.QueueBackend] = None) -> None:
         self.name = schedule_type.value
         self.backend = backend
-        if backend == QueueBackend.MULTIPROCESSING:
+        if backend == server_config.QueueBackend.MULTIPROCESSING:
            self.queue = mp_queue.get_queue(self.name)
-        elif backend == QueueBackend.LOCAL:
+        elif backend == server_config.QueueBackend.LOCAL:
            self.queue = local_queue.get_queue(self.name)
        else:
            raise RuntimeError(f'Invalid queue backend: {backend}')
@@ -162,7 +115,7 @@ class RequestQueue:
         return self.queue.qsize()
 
 
-queue_backend = QueueBackend.MULTIPROCESSING
+queue_backend = server_config.QueueBackend.MULTIPROCESSING
 
 
 def executor_initializer(proc_group: str):
@@ -186,13 +139,11 @@ class RequestWorker:
     # if there are available CPU/memory resources.
     burstable_parallelism: int = 0
 
-    def __init__(self,
-                 schedule_type: api_requests.ScheduleType,
-                 garanteed_parallelism: int,
-                 burstable_parallelism: int = 0) -> None:
+    def __init__(self, schedule_type: api_requests.ScheduleType,
+                 config: server_config.WorkerConfig) -> None:
         self.schedule_type = schedule_type
-        self.garanteed_parallelism = garanteed_parallelism
-        self.burstable_parallelism = burstable_parallelism
+        self.garanteed_parallelism = config.garanteed_parallelism
+        self.burstable_parallelism = config.burstable_parallelism
 
     def __str__(self) -> str:
         return f'Worker(schedule_type={self.schedule_type.value})'
@@ -455,80 +406,17 @@ def schedule_request(
     enqueue()
 
 
-def start(deploy: bool) -> List[multiprocessing.Process]:
+def start(config: server_config.ServerConfig) -> List[multiprocessing.Process]:
    """Start the request workers.
 
    Request workers run in background, schedule the requests and delegate the
-    request execution to executor processes. We have different assumptions for
-    the resources in different deployment modes, which leads to different
-    worker setups:
-
-    - Deployment mode (deploy=True), we assume the resources are dedicated to
-      the API server and the resources will be tuned for serious use cases, so:
-      - Use multiprocessing queue backend and dedicated workers processes to
-        avoid GIL contention.
-      - Parallelism (number of executor processes) is fixed and executor
-        processes have same lifecycle with the server, which ensures
-        best-effort cache reusing and stable resources consumption.
-      - Reject to start in low resource environments, to avoid flaky
-        deployments.
-    - Local mode (deploy=False), we assume the server is running in a shared
-      environment (e.g. laptop) and users typically do not pay attention to
-      the resource setup of the server. Moreover, existing users may expect
-      some consistent behaviors with old versions, i.e. before API server was
-      introduced, so:
-      - The max number of long-running executor processes are limited, to avoid
-        high memory consumption when the server is idle.
-      - Allow burstable workers to handle requests when all long-running
-        workers are busy, which mimics the behavior of local sky CLI before
-        API server was introduced.
-      - Works in low resources environments, and further reduce the memory
-        consumption in low resource environments.
-
-    Note that there is still significant overhead for SDK users when migrate to
-    local API server. Since the users are free to run sky operations in Threads
-    when using SDK but all client operations will occupy at least one worker
-    process after API server was introduced.
+    request execution to executor processes.
    """
-
-    cpu_count = common_utils.get_cpu_count()
-    mem_size_gb = common_utils.get_mem_size_gb()
-    mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
-    # Runs in low resource mode if the available memory is less than
-    # server_constants.MIN_AVAIL_MEM_GB.
-    max_parallel_for_long = _max_long_worker_parallism(cpu_count,
-                                                       mem_size_gb,
-                                                       local=not deploy)
-    max_parallel_for_short = _max_short_worker_parallism(
-        mem_size_gb, max_parallel_for_long)
-    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
-        # Permanent worker process may have significant memory consumption
-        # (~350MB per worker) after running commands like `sky check`, so we
-        # don't start any permanent workers in low resource local mode. This
-        # mimics the behavior of local sky CLI before API server was
-        # introduced, where the CLI will start new process everytime and
-        # never reject to start due to resource constraints.
-        # Note that the refresh daemon will still occupy one worker
-        # permanently because it never exits.
-        max_parallel_for_long = 0
-        max_parallel_for_short = 0
-        logger.warning(
-            'SkyPilot API server will run in low resource mode because '
-            'the available memory is less than '
-            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
-    else:
-        logger.info(
-            f'SkyPilot API server will start {max_parallel_for_long} workers '
-            f'for long requests and will allow at max '
-            f'{max_parallel_for_short} short requests in parallel.')
-    if not deploy:
-        # For local mode, use local queue backend since we only run 1 uvicorn
-        # worker in local mode.
-        global queue_backend
-        queue_backend = QueueBackend.LOCAL
+    global queue_backend
+    queue_backend = config.queue_backend
    sub_procs = []
    # Setup the queues.
-    if queue_backend == QueueBackend.MULTIPROCESSING:
+    if queue_backend == server_config.QueueBackend.MULTIPROCESSING:
        logger.info('Creating shared request queues')
        queue_names = [
            schedule_type.value for schedule_type in api_requests.ScheduleType
@@ -547,7 +435,7 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
        mp_queue.wait_for_queues_to_be_ready(queue_names,
                                             queue_server,
                                             port=port)
-    elif queue_backend == QueueBackend.LOCAL:
+    elif queue_backend == server_config.QueueBackend.LOCAL:
        # No setup is needed for local queue backend.
        pass
    else:
@@ -563,40 +451,13 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
    thread = threading.Thread(target=worker.run, daemon=True)
    thread.start()
 
-    burstable_parallelism = _BURSTABLE_WORKERS_FOR_LOCAL if not deploy else 0
    # Start a worker for long requests.
    long_worker = RequestWorker(schedule_type=api_requests.ScheduleType.LONG,
-                                garanteed_parallelism=max_parallel_for_long,
-                                burstable_parallelism=burstable_parallelism)
+                                config=config.long_worker_config)
    run_worker_in_background(long_worker)
 
    # Start a worker for short requests.
    short_worker = RequestWorker(schedule_type=api_requests.ScheduleType.SHORT,
-                                 garanteed_parallelism=max_parallel_for_short,
-                                 burstable_parallelism=burstable_parallelism)
+                                 config=config.short_worker_config)
    run_worker_in_background(short_worker)
    return sub_procs
-
-
-@annotations.lru_cache(scope='global', maxsize=1)
-def _max_long_worker_parallism(cpu_count: int,
-                               mem_size_gb: float,
-                               local=False) -> int:
-    """Max parallelism for long workers."""
-    cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
-    mem_based_max_parallel = int(mem_size_gb * _MAX_MEM_PERCENT_FOR_BLOCKING /
-                                 _LONG_WORKER_MEM_GB)
-    n = max(_MIN_LONG_WORKERS,
-            min(cpu_based_max_parallel, mem_based_max_parallel))
-    if local:
-        return min(n, _MAX_LONG_WORKERS_LOCAL)
-    return n
-
-
-@annotations.lru_cache(scope='global', maxsize=1)
-def _max_short_worker_parallism(mem_size_gb: float,
-                                long_worker_parallism: int) -> int:
-    """Max parallelism for short workers."""
-    available_mem = mem_size_gb - (long_worker_parallism * _LONG_WORKER_MEM_GB)
-    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
-    return n
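
Taken together with sky/server/config.py above, the refactor moves all sizing and queue-backend decisions out of the executor: callers compute a ServerConfig once and hand it to `start()`. A minimal sketch of the new wiring, mirroring the call site in sky/server/server.py below:

# Sketch of the new call pattern, assuming only the modules in this diff.
from sky.server import config as server_config
from sky.server.requests import executor

# Sizing happens once, up front. deploy=False selects local mode: one server
# worker, the in-process local queue backend, and burstable workers.
config = server_config.compute_server_config(deploy=False)

# The executor consumes the precomputed config instead of a `deploy` flag.
sub_procs = executor.start(config)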
sky/server/server.py
CHANGED
@@ -35,6 +35,7 @@ from sky.jobs.server import server as jobs_rest
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.serve.server import server as serve_rest
 from sky.server import common
+from sky.server import config as server_config
 from sky.server import constants as server_constants
 from sky.server import stream_utils
 from sky.server.requests import executor
@@ -1166,13 +1167,12 @@ if __name__ == '__main__':
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
 
-    num_workers = None
-    if cmd_args.deploy:
-        num_workers = common_utils.get_cpu_count()
+    config = server_config.compute_server_config(cmd_args.deploy)
+    num_workers = config.num_server_workers
 
     sub_procs = []
     try:
-        sub_procs = executor.start(cmd_args.deploy)
+        sub_procs = executor.start(config)
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
@@ -368,6 +368,13 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
     'Failed to assign Storage Blob Data Owner role to the '
     'storage account {storage_account_name}.')
 
+# Constants for path in K8S pod to store persistent setup and run scripts
+# so that we can run them again after the pod restarts.
+# Path within user home. For HA controller, assumes home directory is
+# persistent through PVC. See kubernetes-ray.yml.j2.
+PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
+PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
+
 # The placeholder for the local skypilot config path in file mounts for
 # controllers.
 LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
sky/skypilot_config.py
CHANGED
@@ -56,8 +56,6 @@ import threading
 import typing
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 
-from omegaconf import OmegaConf
-
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -321,6 +319,31 @@ def _parse_config_file(config_path: str) -> config_utils.Config:
     return config
 
 
+def _parse_dotlist(dotlist: List[str]) -> config_utils.Config:
+    """Parse a comma-separated list of key-value pairs into a dictionary.
+
+    Args:
+        dotlist: A comma-separated list of key-value pairs.
+
+    Returns:
+        A config_utils.Config object with the parsed key-value pairs.
+    """
+    config: config_utils.Config = config_utils.Config()
+    for arg in dotlist:
+        try:
+            key, value = arg.split('=', 1)
+        except ValueError as e:
+            raise ValueError(f'Invalid config override: {arg}. '
+                             'Please use the format: key=value') from e
+        if len(key) == 0 or len(value) == 0:
+            raise ValueError(f'Invalid config override: {arg}. '
+                             'Please use the format: key=value')
+        value = yaml.safe_load(value)
+        nested_keys = tuple(key.split('.'))
+        config.set_nested(nested_keys, value)
+    return config
+
+
 def _reload_config_from_internal_file(internal_config_path: str) -> None:
     global _dict, _loaded_config_path
     # Reset the global variables, to avoid using stale values.
@@ -483,11 +506,9 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
                     'Cannot use multiple --config flags with a config file.')
             config_source = maybe_config_path
             # cli_config is a path to a config file
-            parsed_config = config_utils.Config.from_dict(
-                OmegaConf.load(maybe_config_path))
+            parsed_config = _parse_config_file(maybe_config_path)
         else:  # cli_config is a comma-separated list of key-value pairs
-            parsed_config = config_utils.Config.from_dict(
-                OmegaConf.from_dotlist(cli_config))
+            parsed_config = _parse_dotlist(cli_config)
         _validate_config(parsed_config, config_source)
     except ValueError as e:
         raise ValueError(f'Invalid config override: {cli_config}. '
sky/task.py
CHANGED
@@ -306,7 +306,7 @@ class Task:
         self.service_name: Optional[str] = None
 
         # Filled in by the optimizer. If None, this Task is not planned.
-        self.best_resources = None
+        self.best_resources: Optional[sky.Resources] = None
 
         # For internal use only.
         self.file_mounts_mapping = file_mounts_mapping