skypilot-nightly 1.0.0.dev20251009-py3-none-any.whl → 1.0.0.dev20251107-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -1,16 +1,17 @@
 """Controller: handles scheduling and the life cycle of a managed job.
 """
 import asyncio
-import logging
+import io
 import os
 import pathlib
 import resource
 import shutil
 import sys
+import threading
 import time
 import traceback
 import typing
-from typing import Dict, Optional, Set, Tuple
+from typing import Dict, Optional, Set

 import dotenv

@@ -23,6 +24,8 @@ from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
 from sky.jobs import constants as jobs_constants
+from sky.jobs import file_content_utils
+from sky.jobs import log_gc
 from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
@@ -30,6 +33,7 @@ from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import context
@@ -62,17 +66,26 @@ async def create_background_task(coro: typing.Coroutine) -> None:
     task.add_done_callback(_background_tasks.discard)


-def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
-    dag = dag_utils.load_chain_dag_from_yaml(dag_yaml)
-    dag_name = dag.name
-    assert dag_name is not None, dag
-    return dag, dag_name
+# Make sure to limit the size as we don't want to cache too many DAGs in memory.
+@annotations.lru_cache(scope='global', maxsize=50)
+def _get_dag(job_id: int) -> 'sky.Dag':
+    dag_content = file_content_utils.get_job_dag_content(job_id)
+    if dag_content is None:
+        raise RuntimeError('Managed job DAG YAML content is unavailable for '
+                           f'job {job_id}. This can happen if the job was '
+                           'submitted before file migration completed or if '
+                           'the submission failed to persist the DAG. Please '
+                           're-submit the job.')

+    dag = dag_utils.load_chain_dag_from_yaml_str(dag_content)
+    assert dag.name is not None, dag
+    return dag

-class JobsController:
+
+class JobController:
     """Controls the lifecycle of a single managed job.

-    This controller executes a chain DAG defined in ``dag_yaml`` by:
+    This controller executes the chain DAG recorded for the job by:
     - Loading the DAG and preparing per-task environment variables so each task
       has a stable global job identifier across recoveries.
     - Launching the task on the configured backend (``CloudVmRayBackend``),
@@ -92,10 +105,10 @@ class JobsController:

     Key attributes:
     - ``_job_id``: Integer identifier of this managed job.
-    - ``_dag_yaml`` / ``_dag`` / ``_dag_name``: The job definition and metadata.
+    - ``_dag`` / ``_dag_name``: The job definition and metadata loaded from the
+      database-backed job YAML.
     - ``_backend``: Backend used to launch and manage clusters.
     - ``_pool``: Optional pool name if using a cluster pool.
-    - ``_logger``: Job-scoped logger for progress and diagnostics.
     - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
       coordination primitives. ``starting_lock`` must be used for accessing
       ``starting_signal`` and ``starting``
@@ -106,8 +119,6 @@ class JobsController:
     def __init__(
         self,
         job_id: int,
-        dag_yaml: str,
-        job_logger: logging.Logger,
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
@@ -117,8 +128,6 @@ class JobsController:

         Args:
             job_id: Integer ID of the managed job.
-            dag_yaml: Path to the YAML file containing the chain DAG to run.
-            job_logger: Logger instance dedicated to this job.
             starting: Shared set of job IDs currently in the STARTING phase,
                 used to limit concurrent launches.
             starting_lock: ``asyncio.Lock`` guarding access to the shared
@@ -134,14 +143,12 @@ class JobsController:
         self.starting_lock = starting_lock
         self.starting_signal = starting_signal

-        self._logger = job_logger
-        self._logger.info(f'Initializing JobsController for job_id={job_id}, '
-                          f'dag_yaml={dag_yaml}')
+        logger.info('Initializing JobsController for job_id=%s', job_id)

         self._job_id = job_id
-        self._dag_yaml = dag_yaml
-        self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
-        self._logger.info(f'Loaded DAG: {self._dag}')
+        self._dag = _get_dag(job_id)
+        self._dag_name = self._dag.name
+        logger.info(f'Loaded DAG: {self._dag}')

         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
         self._pool = pool
@@ -191,8 +198,8 @@ class JobsController:
         preemptions or ssh disconnection during the streaming.
         """
         if handle is None:
-            self._logger.info(f'Cluster for job {self._job_id} is not found. '
-                              'Skipping downloading and streaming the logs.')
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
             return

         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -210,11 +217,11 @@ class JobsController:
                 managed_job_state.set_local_log_file(self._job_id, task_id,
                                                      log_file)
             else:
-                self._logger.warning(
+                logger.warning(
                     f'No log file was downloaded for job {self._job_id}, '
                     f'task {task_id}')

-        self._logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
+        logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

     async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
         if cluster_name is None:
@@ -259,7 +266,7 @@ class JobsController:
         Other exceptions may be raised depending on the backend.
         """
         task_start_time = time.time()
-        self._logger.info(
+        logger.info(
             f'Starting task {task_id} ({task.name}) for job {self._job_id}')

         latest_task_id, last_task_prev_status = (
@@ -271,22 +278,20 @@ class JobsController:
                 managed_job_state.ManagedJobStatus.PENDING):
             assert latest_task_id >= task_id, (latest_task_id, task_id)
             if latest_task_id > task_id:
-                self._logger.info(f'Task {task_id} ({task.name}) has already '
-                                  'been executed. Skipping...')
+                logger.info(f'Task {task_id} ({task.name}) has already '
+                            'been executed. Skipping...')
                 return True
             if latest_task_id == task_id:
                 # Start recovery.
                 is_resume = True
-                self._logger.info(
-                    f'Resuming task {task_id} from previous execution')
+                logger.info(f'Resuming task {task_id} from previous execution')

         callback_func = managed_job_utils.event_callback_func(
             job_id=self._job_id, task_id=task_id, task=task)

         if task.run is None:
-            self._logger.info(
-                f'Skip running task {task_id} ({task.name}) due to its '
-                'run commands being empty.')
+            logger.info(f'Skip running task {task_id} ({task.name}) due to its '
+                        'run commands being empty.')
             # Call set_started first to initialize columns in the state table,
             # including start_at and last_recovery_at to avoid issues for
             # uninitialized columns.
@@ -300,8 +305,7 @@ class JobsController:
                 task_id=task_id,
                 end_time=time.time(),
                 callback_func=callback_func)
-            self._logger.info(
-                f'Empty task {task_id} marked as succeeded immediately')
+            logger.info(f'Empty task {task_id} marked as succeeded immediately')
             return True

         usage_lib.messages.usage.update_task_id(task_id)
@@ -314,8 +318,7 @@ class JobsController:
             task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
             cluster_name, self._backend, task, self._job_id, task_id,
-            self._logger, self._pool, self.starting, self.starting_lock,
-            self.starting_signal)
+            self._pool, self.starting, self.starting_lock, self.starting_signal)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -336,11 +339,11 @@ class JobsController:
                         self._strategy_executor.max_restarts_on_errors
                 },
                 callback_func=callback_func)
-            self._logger.info(f'Submitted managed job {self._job_id} '
-                              f'(task: {task_id}, name: {task.name!r}); '
-                              f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
+            logger.info(f'Submitted managed job {self._job_id} '
+                        f'(task: {task_id}, name: {task.name!r}); '
+                        f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

-        self._logger.info('Started monitoring.')
+        logger.info('Started monitoring.')

         # Only do the initial cluster launch if not resuming from a controller
         # failure. Otherwise, we will transit to recovering immediately.
@@ -354,7 +357,7 @@ class JobsController:
             remote_job_submitted_at = await self._strategy_executor.launch()

             launch_time = time.time() - launch_start
-            self._logger.info(f'Cluster launch completed in {launch_time:.2f}s')
+            logger.info(f'Cluster launch completed in {launch_time:.2f}s')
         assert remote_job_submitted_at is not None, remote_job_submitted_at
         if self._pool is None:
             job_id_on_pool_cluster = None
@@ -367,16 +370,16 @@ class JobsController:
             # Check if we have been cancelled here, in the case where a user
             # quickly cancels the job we want to gracefully handle it here,
             # otherwise we will end up in the FAILED_CONTROLLER state.
-            self._logger.info(f'Cluster name is None for job {self._job_id}, '
-                              f'task {task_id}. Checking if we have been '
-                              'cancelled.')
+            logger.info(f'Cluster name is None for job {self._job_id}, '
+                        f'task {task_id}. Checking if we have been '
+                        'cancelled.')
             status = await (managed_job_state.get_job_status_with_task_id_async(
                 job_id=self._job_id, task_id=task_id))
-            self._logger.debug(f'Status for job {self._job_id}, task {task_id}:'
-                               f'{status}')
+            logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+                         f'{status}')
             if status == managed_job_state.ManagedJobStatus.CANCELLED:
-                self._logger.info(f'Job {self._job_id}, task {task_id} has '
-                                  'been quickly cancelled.')
+                logger.info(f'Job {self._job_id}, task {task_id} has '
+                            'been quickly cancelled.')
                 raise asyncio.CancelledError()
         assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

@@ -417,7 +420,7 @@ class JobsController:

         if prev_status is not None:
             if prev_status.is_terminal():
-                self._logger.info(
+                logger.info(
                     f'Task {task_id} already in terminal state: '
                     f'{prev_status}')
                 return (prev_status ==
@@ -427,9 +430,8 @@ class JobsController:
                 # If the controller is down when cancelling the job,
                 # we re-raise the error to run the `_cleanup` function
                 # again to clean up any remaining resources.
-                self._logger.info(
-                    f'Task {task_id} was being cancelled, '
-                    're-raising cancellation')
+                logger.info(f'Task {task_id} was being cancelled, '
+                            're-raising cancellation')
                 raise asyncio.CancelledError()
             if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
                 force_transit_to_recovering = True
@@ -443,10 +445,9 @@ class JobsController:
             try:
                 await backend_utils.async_check_network_connection()
             except exceptions.NetworkError:
-                self._logger.info(
-                    'Network is not available. Retrying again in '
-                    f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
-                    'seconds.')
+                logger.info('Network is not available. Retrying again in '
+                            f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
+                            'seconds.')
                 continue

             # NOTE: we do not check cluster status first because race condition
@@ -461,23 +462,22 @@ class JobsController:
                     self._backend,
                     cluster_name,
                     job_id=job_id_on_pool_cluster,
-                    job_logger=self._logger,
                 )
             except exceptions.FetchClusterInfoError as fetch_e:
-                self._logger.info(
+                logger.info(
                     'Failed to fetch the job status. Start recovery.\n'
                     f'Exception: {common_utils.format_exception(fetch_e)}\n'
                     f'Traceback: {traceback.format_exc()}')

             if job_status == job_lib.JobStatus.SUCCEEDED:
-                self._logger.info(f'Task {task_id} succeeded! '
-                                  'Getting end time and cleaning up')
+                logger.info(f'Task {task_id} succeeded! '
+                            'Getting end time and cleaning up')
                 try:
                     success_end_time = await context_utils.to_thread(
                         managed_job_utils.try_to_get_job_end_time,
                         self._backend, cluster_name, job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
-                    self._logger.warning(
+                    logger.warning(
                         f'Failed to get job end time: '
                         f'{common_utils.format_exception(e)}',
                         exc_info=True)
@@ -490,7 +490,7 @@ class JobsController:
                     task_id,
                     end_time=success_end_time,
                     callback_func=callback_func)
-                self._logger.info(
+                logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
@@ -511,7 +511,7 @@ class JobsController:
                         job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
-                    self._logger.warning(
+                    logger.warning(
                         f'Failed to download and stream logs: '
                         f'{common_utils.format_exception(e)}',
                         exc_info=True)
@@ -521,10 +521,10 @@ class JobsController:

                 task_total_time = time.time() - task_start_time
                 monitoring_time = time.time() - monitoring_start_time
-                self._logger.info(f'Task {task_id} completed successfully in '
-                                  f'{task_total_time:.2f}s '
-                                  f'(monitoring time: {monitoring_time:.2f}s, '
-                                  f'status checks: {status_check_count})')
+                logger.info(f'Task {task_id} completed successfully in '
+                            f'{task_total_time:.2f}s '
+                            f'(monitoring time: {monitoring_time:.2f}s, '
+                            f'status checks: {status_check_count})')
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -560,7 +560,7 @@ class JobsController:
                 # code).
                 cluster_status_str = ('' if cluster_status is None else
                                       f' (status: {cluster_status.value})')
-                self._logger.info(
+                logger.info(
                     f'Cluster is preempted or failed{cluster_status_str}. '
                     'Recovering...')
             else:
@@ -571,12 +571,12 @@ class JobsController:
                         in job_lib.JobStatus.user_code_failure_states() or
                         job_status == job_lib.JobStatus.FAILED_DRIVER):
                     # The user code has probably crashed, fail immediately.
-                    self._logger.info(
+                    logger.info(
                         f'Task {task_id} failed with status: {job_status}')
                     end_time = await context_utils.to_thread(
                         managed_job_utils.try_to_get_job_end_time,
                         self._backend, cluster_name, job_id_on_pool_cluster)
-                    self._logger.info(
+                    logger.info(
                         f'The user job failed ({job_status}). Please check the '
                         'logs below.\n'
                         f'== Logs of the user job (ID: {self._job_id}) ==\n')
@@ -611,7 +611,7 @@ class JobsController:
                     if should_restart_on_failure:
                         max_restarts = (
                             self._strategy_executor.max_restarts_on_errors)
-                        self._logger.info(
+                        logger.info(
                             f'User program crashed '
                             f'({managed_job_status.value}). '
                             f'Retry the job as max_restarts_on_errors is '
@@ -619,7 +619,7 @@ class JobsController:
                             f'[{self._strategy_executor.restart_cnt_on_failure}'
                             f'/{max_restarts}]')
                     else:
-                        self._logger.info(
+                        logger.info(
                             f'Task {task_id} failed and will not be retried')
                         await managed_job_state.set_failed_async(
                             self._job_id,
@@ -632,7 +632,7 @@ class JobsController:
                 elif job_status is not None:
                     # Either the job is cancelled (should not happen) or in some
                     # unknown new state that we do not handle.
-                    self._logger.error(f'Unknown job status: {job_status}')
+                    logger.error(f'Unknown job status: {job_status}')
                     failure_reason = (
                         f'Unknown job status {job_status}. To see the details, '
                         f'run: sky jobs logs --controller {self._job_id}')
@@ -649,10 +649,9 @@ class JobsController:
                     # job status. Try to recover the job (will not restart the
                     # cluster, if the cluster is healthy).
                     assert job_status is None, job_status
-                    self._logger.info(
-                        'Failed to fetch the job status while the '
-                        'cluster is healthy. Try to recover the job '
-                        '(the cluster will not be restarted).')
+                    logger.info('Failed to fetch the job status while the '
+                                'cluster is healthy. Try to recover the job '
+                                '(the cluster will not be restarted).')
             # When the handle is None, the cluster should be cleaned up already.
             if handle is not None:
                 resources = handle.launched_resources
@@ -671,15 +670,14 @@ class JobsController:
                     # Some spot resource (e.g., Spot TPU VM) may need to be
                     # cleaned up after preemption, as running launch again on
                     # those clusters again may fail.
-                    self._logger.info(
-                        'Cleaning up the preempted or failed cluster'
-                        '...')
+                    logger.info('Cleaning up the preempted or failed cluster'
+                                '...')
                     await self._cleanup_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
-            self._logger.info(f'Starting recovery for task {task_id}, '
-                              f'it is currently {job_status}')
+            logger.info(f'Starting recovery for task {task_id}, '
+                        f'it is currently {job_status}')
             await managed_job_state.set_recovering_async(
                 job_id=self._job_id,
                 task_id=task_id,
@@ -701,7 +699,7 @@ class JobsController:

     async def run(self):
         """Run controller logic and handle exceptions."""
-        self._logger.info(f'Starting JobsController run for job {self._job_id}')
+        logger.info(f'Starting JobsController run for job {self._job_id}')
         task_id = 0
         cancelled = False

@@ -709,39 +707,36 @@ class JobsController:
             succeeded = True
             # We support chain DAGs only for now.
             for task_id, task in enumerate(self._dag.tasks):
-                self._logger.info(
+                logger.info(
                     f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
                     f'{task.name}')
                 task_start = time.time()
                 succeeded = await self._run_one_task(task_id, task)
                 task_time = time.time() - task_start
-                self._logger.info(
-                    f'Task {task_id} completed in {task_time:.2f}s '
-                    f'with success={succeeded}')
+                logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+                            f'with success={succeeded}')

                 if not succeeded:
-                    self._logger.info(
-                        f'Task {task_id} failed, stopping execution')
+                    logger.info(f'Task {task_id} failed, stopping execution')
                     break

         except exceptions.ProvisionPrechecksError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
-            self._logger.error(f'Provision prechecks failed for task {task_id}')
+            logger.error(f'Provision prechecks failed for task {task_id}')
             failure_reason = ('; '.join(
                 common_utils.format_exception(reason, use_bracket=True)
                 for reason in e.reasons))
-            self._logger.error(failure_reason)
+            logger.error(failure_reason)
             await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
                 failure_reason)
         except exceptions.ManagedJobReachedMaxRetriesError as e:
             # Please refer to the docstring of self._run for the cases when
             # this exception can occur.
-            self._logger.error(
-                f'Managed job reached max retries for task {task_id}')
+            logger.error(f'Managed job reached max retries for task {task_id}')
             failure_reason = common_utils.format_exception(e)
-            self._logger.error(failure_reason)
+            logger.error(failure_reason)
             # The managed job should be marked as FAILED_NO_RESOURCE, as the
             # managed job may be able to launch next time.
             await self._update_failed_task_state(
@@ -753,13 +748,13 @@ class JobsController:
             cancelled = True
             raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
-            self._logger.error(
+            logger.error(
                 f'Unexpected error in JobsController run for task {task_id}')
             with ux_utils.enable_traceback():
-                self._logger.error(traceback.format_exc())
+                logger.error(traceback.format_exc())
             msg = ('Unexpected error occurred: ' +
                    common_utils.format_exception(e, use_bracket=True))
-            self._logger.error(msg)
+            logger.error(msg)
             await self._update_failed_task_state(
                 task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
                 msg)
@@ -783,8 +778,8 @@ class JobsController:
             failure_type: managed_job_state.ManagedJobStatus,
             failure_reason: str):
         """Update the state of the failed task."""
-        self._logger.info(f'Updating failed task state: task_id={task_id}, '
-                          f'failure_type={failure_type}')
+        logger.info(f'Updating failed task state: task_id={task_id}, '
+                    f'failure_type={failure_type}')
         await managed_job_state.set_failed_async(
             self._job_id,
             task_id=task_id,
@@ -796,10 +791,14 @@ class JobsController:
                 task=self._dag.tasks[task_id]))


-class Controller:
-    """Controller for managing jobs."""
+class ControllerManager:
+    """Main loop for a job controller process.
+
+    Many jobs will be handled by this, each by a single JobController.
+    """

-    def __init__(self) -> None:
+    def __init__(self, controller_uuid: str) -> None:
+        self._controller_uuid = controller_uuid
         # Global state for active jobs
         self.job_tasks: Dict[int, asyncio.Task] = {}
         self.starting: Set[int] = set()
@@ -813,11 +812,9 @@ class Controller:
         # launch).
         self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)

-    async def _cleanup(self,
-                       job_id: int,
-                       dag_yaml: str,
-                       job_logger: logging.Logger,
-                       pool: Optional[str] = None):
+        self._pid = os.getpid()
+
+    async def _cleanup(self, job_id: int, pool: Optional[str] = None):
         """Clean up the cluster(s) and storages.

         (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -842,14 +839,13 @@ class Controller:
                 cluster_name = (
                     managed_job_utils.generate_managed_job_cluster_name(
                         task.name, job_id))
-                managed_job_utils.terminate_cluster(cluster_name,
-                                                    _logger=job_logger)
+                managed_job_utils.terminate_cluster(cluster_name)
                 status = core.status(cluster_names=[cluster_name],
                                      all_users=True)
                 assert (len(status) == 0 or
                         status[0]['status'] == sky.ClusterStatus.STOPPED), (
                             f'{cluster_name} is not down: {status}')
-                job_logger.info(f'{cluster_name} is down')
+                logger.info(f'{cluster_name} is down')
             else:
                 cluster_name, job_id_on_pool_cluster = (
                     managed_job_state.get_pool_submit_info(job_id))
@@ -860,7 +856,7 @@ class Controller:
                     _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             error = e
-            job_logger.warning(
+            logger.warning(
                 f'Failed to terminate cluster {cluster_name}: {e}')
             # we continue to try cleaning up whatever else we can.
         # Clean up Storages with persistent=False.
@@ -874,7 +870,7 @@ class Controller:
                 for storage in task.storage_mounts.values():
                     storage.construct()
             except (exceptions.StorageSpecError, exceptions.StorageError) as e:
-                job_logger.warning(
+                logger.warning(
                     f'Failed to construct storage object for teardown: {e}\n'
                     'This may happen because storage construction already '
                     'failed during launch, storage was deleted externally, '
@@ -884,7 +880,7 @@ class Controller:
                 backend.teardown_ephemeral_storage(task)
             except Exception as e:  # pylint: disable=broad-except
                 error = e
-                job_logger.warning(f'Failed to teardown ephemeral storage: {e}')
+                logger.warning(f'Failed to teardown ephemeral storage: {e}')
                 # we continue to try cleaning up whatever else we can.

             # Clean up any files mounted from the local disk, such as two-hop
@@ -902,13 +898,13 @@ class Controller:
                     else:
                         os.remove(path)
                 except Exception as e:  # pylint: disable=broad-except
-                    job_logger.warning(
+                    logger.warning(
                         f'Failed to clean up file mount {file_mount}: {e}')

             if error is not None:
                 raise error

-        dag, _ = _get_dag_and_name(dag_yaml)
+        dag = _get_dag(job_id)
         error = None
         for task in dag.tasks:
             # most things in this function are blocking
@@ -924,58 +920,52 @@ class Controller:

     # Use context.contextual to enable per-job output redirection and env var
     # isolation.
-    @context.contextual
+    @context.contextual_async
     async def run_job_loop(self,
                            job_id: int,
-                           dag_yaml: str,
-                           job_logger: logging.Logger,
                            log_file: str,
-                           env_file_path: Optional[str] = None,
                            pool: Optional[str] = None):
         """Background task that runs the job loop."""
         ctx = context.get()
         assert ctx is not None, 'Context is not initialized'
         ctx.redirect_log(pathlib.Path(log_file))

-        # Load and apply environment variables from the job's environment file
-        if env_file_path and os.path.exists(env_file_path):
-            try:
-                # Load environment variables from the file
-                env_vars = dotenv.dotenv_values(env_file_path)
-                job_logger.info(f'Loading environment from {env_file_path}: '
-                                f'{list(env_vars.keys())}')
+        logger.info('Starting job loop for %s', job_id)
+        logger.info(' log_file=%s', log_file)
+        logger.info(' pool=%s', pool)
+        logger.info(f'From controller {self._controller_uuid}')
+        logger.info(f' pid={self._pid}')

-                # Apply environment variables to the job's context
+        env_content = file_content_utils.get_job_env_content(job_id)
+        if env_content:
+            try:
+                env_vars = dotenv.dotenv_values(stream=io.StringIO(env_content))
+                logger.info('Loading %d environment variables for job %s',
+                            len(env_vars), job_id)
                 if ctx is not None:
                     for key, value in env_vars.items():
                         if value is not None:
                             ctx.override_envs({key: value})
-                            job_logger.debug(
-                                f'Set environment variable: {key}={value}')
-                    # Reload the skypilot config for this context to make sure
-                    # the latest config is used.
+                            logger.debug('Set environment variable: %s=%s', key,
                                          value)
                     skypilot_config.reload_config()
-                else:
-                    job_logger.error(
-                        'Context is None, cannot set environment variables')
+                else:  # pragma: no cover - defensive
+                    logger.error('Context is None, cannot set environment '
                                  'variables')
             except Exception as e:  # pylint: disable=broad-except
-                job_logger.error(
-                    f'Failed to load environment file {env_file_path}: {e}')
-        elif env_file_path:
-            job_logger.error(f'Environment file not found: {env_file_path}')
+                logger.error(
                     'Failed to load environment variables for job %s: '
                     '%s', job_id, e)

         cancelling = False
         try:
-            job_logger.info(f'Starting job loop for {job_id}')
-
-            controller = JobsController(job_id, dag_yaml, job_logger,
-                                        self.starting, self._job_tasks_lock,
-                                        self._starting_signal, pool)
+            controller = JobController(job_id, self.starting,
                                        self._job_tasks_lock,
                                        self._starting_signal, pool)

             async with self._job_tasks_lock:
                 if job_id in self.job_tasks:
-                    job_logger.error(
-                        f'Job {job_id} already exists in job_tasks')
+                    logger.error(f'Job {job_id} already exists in job_tasks')
                     raise ValueError(f'Job {job_id} already exists')

             # Create the task and store it
@@ -985,13 +975,13 @@ class Controller:
                 self.job_tasks[job_id] = task
             await task
         except asyncio.CancelledError:
-            job_logger.info(f'Job {job_id} was cancelled')
-            dag, _ = _get_dag_and_name(dag_yaml)
+            logger.info(f'Job {job_id} was cancelled')
+            dag = _get_dag(job_id)
             task_id, _ = await (
                 managed_job_state.get_latest_task_id_status_async(job_id))
             assert task_id is not None, job_id
-            job_logger.info(f'Cancelling managed job, job_id: {job_id}, '
-                            f'task_id: {task_id}')
+            logger.info(f'Cancelling managed job, job_id: {job_id}, '
+                        f'task_id: {task_id}')
             await managed_job_state.set_cancelling_async(
                 job_id=job_id,
                 callback_func=managed_job_utils.event_callback_func(
@@ -999,16 +989,13 @@ class Controller:
             cancelling = True
             raise
         except Exception as e:
-            job_logger.error(f'Unexpected error in job loop for {job_id}: '
-                             f'{common_utils.format_exception(e)}')
+            logger.error(f'Unexpected error in job loop for {job_id}: '
+                         f'{common_utils.format_exception(e)}')
             raise
         finally:
            try:
-                await self._cleanup(job_id,
-                                    dag_yaml=dag_yaml,
-                                    job_logger=job_logger,
-                                    pool=pool)
-                job_logger.info(
+                await self._cleanup(job_id, pool=pool)
+                logger.info(
                     f'Cluster of managed job {job_id} has been cleaned up.')
             except Exception as e:  # pylint: disable=broad-except
                 failure_reason = ('Failed to clean up: '
@@ -1037,7 +1024,7 @@ class Controller:
             # The job can be non-terminal if the controller exited abnormally,
             # e.g. failed to launch cluster after reaching the MAX_RETRY.
             if not job_status.is_terminal():
-                job_logger.info(f'Previous job status: {job_status.value}')
+                logger.info(f'Previous job status: {job_status.value}')
                 await managed_job_state.set_failed_async(
                     job_id,
                     task_id=None,
@@ -1069,48 +1056,25 @@ class Controller:
     async def start_job(
         self,
         job_id: int,
-        dag_yaml: str,
-        env_file_path: Optional[str] = None,
         pool: Optional[str] = None,
     ):
         """Start a new job.

         Args:
             job_id: The ID of the job to start.
-            dag_yaml: Path to the YAML file containing the DAG definition.
-            env_file_path: Optional path to environment file for the job.
         """
-        # Create a job-specific logger
+        # Create log file path for job output redirection
         log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
         os.makedirs(log_dir, exist_ok=True)
        log_file = os.path.join(log_dir, f'{job_id}.log')

-        job_logger = logging.getLogger(f'sky.jobs.{job_id}')
-        job_logger.setLevel(logging.DEBUG)
-
-        # Create file handler
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setLevel(logging.DEBUG)
-
-        # Use Sky's standard formatter
-        file_handler.setFormatter(sky_logging.FORMATTER)
-
-        # Add the handler to the logger
-        job_logger.addHandler(file_handler)
-
-        # Prevent log propagation to avoid duplicate logs
-        job_logger.propagate = False
-
-        job_logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
-                        f'env_file_path={env_file_path}')
+        logger.info(f'Starting job {job_id} with log_file={log_file}')

         async with self._job_tasks_lock:
             self.starting.add(job_id)
-        await create_background_task(
-            self.run_job_loop(job_id, dag_yaml, job_logger, log_file,
-                              env_file_path, pool))
+        await create_background_task(self.run_job_loop(job_id, log_file, pool))

-        job_logger.info(f'Job {job_id} started successfully')
+        logger.info(f'Job {job_id} started successfully')

     async def cancel_job(self):
         """Cancel an existing job."""
@@ -1161,6 +1125,7 @@ class Controller:
                     scheduler.get_number_of_controllers()))

             if len(running_tasks) >= max_jobs:
+                logger.info('Too many jobs running, waiting for 60 seconds')
                 await asyncio.sleep(60)
                 continue

@@ -1174,12 +1139,12 @@ class Controller:
                 continue

             if waiting_job is None:
+                logger.info('No waiting job, waiting for 10 seconds')
                 await asyncio.sleep(10)
                 continue

+            logger.info(f'Claiming job {waiting_job["job_id"]}')
             job_id = waiting_job['job_id']
-            dag_yaml_path = waiting_job['dag_yaml_path']
-            env_file_path = waiting_job.get('env_file_path')
             pool = waiting_job.get('pool', None)

             cancels = os.listdir(jobs_constants.CONSOLIDATED_SIGNAL_PATH)
@@ -1199,13 +1164,15 @@ class Controller:
                     job_id=job_id, task_id=None, task=None))
                 continue

-            await self.start_job(job_id, dag_yaml_path, env_file_path, pool)
+            await self.start_job(job_id, pool)
+

+async def main(controller_uuid: str):
+    logger.info(f'Starting controller {controller_uuid}')

-async def main():
     context_utils.hijack_sys_attrs()

-    controller = Controller()
+    controller = ControllerManager(controller_uuid)

     # Will happen multiple times, who cares though
     os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)

@@ -1214,6 +1181,8 @@ async def main():
     soft = None
     try:
         soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+        logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+        logger.info(f'Increasing soft limit to {hard}')
         resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
     except OSError as e:
         logger.warning(f'Failed to increase number of files we can open: {e}\n'
@@ -1222,7 +1191,10 @@ async def main():
     # Will loop forever, do it in the background
     cancel_job_task = asyncio.create_task(controller.cancel_job())
     monitor_loop_task = asyncio.create_task(controller.monitor_loop())
-
+    # Run the garbage collector in a dedicated daemon thread to avoid affecting
+    # the main event loop.
+    gc_thread = threading.Thread(target=log_gc.elect_for_log_gc, daemon=True)
+    gc_thread.start()
     try:
         await asyncio.gather(cancel_job_task, monitor_loop_task)
     except Exception as e:  # pylint: disable=broad-except
@@ -1231,4 +1203,4 @@

 if __name__ == '__main__':
-    asyncio.run(main())
+    asyncio.run(main(sys.argv[1]))
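
For readers tracing the new DAG-loading path in the hunks above: the controller now fetches each job's DAG YAML from database-backed content and memoizes the parsed result, instead of re-reading a YAML file path. The following is an illustrative sketch only (not SkyPilot code): it uses functools.lru_cache as a stand-in for sky.utils.annotations.lru_cache, and a fake in-memory store plus a hypothetical get_job_dag_content() helper in place of sky.jobs.file_content_utils.

# Illustrative sketch of the cached, DB-backed DAG lookup pattern.
import functools
from typing import Dict, Optional

# Stand-in for the per-job DAG YAML content now persisted by the jobs state DB.
_FAKE_DAG_STORE: Dict[int, str] = {42: 'name: demo\ntasks: []\n'}


def get_job_dag_content(job_id: int) -> Optional[str]:
    """Stand-in for file_content_utils.get_job_dag_content()."""
    return _FAKE_DAG_STORE.get(job_id)


@functools.lru_cache(maxsize=50)  # bounded cache, mirroring maxsize=50 above
def get_dag_yaml(job_id: int) -> str:
    """Load and cache the DAG content once per job_id."""
    content = get_job_dag_content(job_id)
    if content is None:
        raise RuntimeError(f'No DAG content recorded for job {job_id}')
    return content


print(get_dag_yaml(42))  # first call reads from the store
print(get_dag_yaml(42))  # second call is served from the LRU cache

The bounded cache keeps memory flat when one controller process drives many jobs, which is the same trade-off the diff calls out in its "limit the size" comment.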