skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/shadeform.py +89 -0
- sky/authentication.py +52 -2
- sky/backends/backend_utils.py +35 -25
- sky/backends/cloud_vm_ray_backend.py +5 -5
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/catalog/shadeform_catalog.py +165 -0
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/clouds/__init__.py +2 -0
- sky/clouds/shadeform.py +393 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/utils.py +44 -39
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +25 -3
- sky/server/server.py +9 -3
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +23 -6
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +15 -11
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -1,7 +1,6 @@
 """Controller: handles scheduling and the life cycle of a managed job.
 """
 import asyncio
-import logging
 import os
 import pathlib
 import resource
@@ -95,7 +94,6 @@ class JobsController:
 - ``_dag_yaml`` / ``_dag`` / ``_dag_name``: The job definition and metadata.
 - ``_backend``: Backend used to launch and manage clusters.
 - ``_pool``: Optional pool name if using a cluster pool.
-- ``_logger``: Job-scoped logger for progress and diagnostics.
 - ``starting`` / ``starting_lock`` / ``starting_signal``: Shared scheduler
 coordination primitives. ``starting_lock`` must be used for accessing
 ``starting_signal`` and ``starting``
@@ -107,7 +105,6 @@ class JobsController:
 self,
 job_id: int,
 dag_yaml: str,
-job_logger: logging.Logger,
 starting: Set[int],
 starting_lock: asyncio.Lock,
 starting_signal: asyncio.Condition,
@@ -118,7 +115,6 @@ class JobsController:
 Args:
 job_id: Integer ID of the managed job.
 dag_yaml: Path to the YAML file containing the chain DAG to run.
-job_logger: Logger instance dedicated to this job.
 starting: Shared set of job IDs currently in the STARTING phase,
 used to limit concurrent launches.
 starting_lock: ``asyncio.Lock`` guarding access to the shared
@@ -134,14 +130,13 @@ class JobsController:
 self.starting_lock = starting_lock
 self.starting_signal = starting_signal

-
-
-f'dag_yaml={dag_yaml}')
+logger.info(f'Initializing JobsController for job_id={job_id}, '
+f'dag_yaml={dag_yaml}')

 self._job_id = job_id
 self._dag_yaml = dag_yaml
 self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
-
+logger.info(f'Loaded DAG: {self._dag}')

 self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 self._pool = pool
@@ -191,8 +186,8 @@ class JobsController:
 preemptions or ssh disconnection during the streaming.
 """
 if handle is None:
-
-
+logger.info(f'Cluster for job {self._job_id} is not found. '
+'Skipping downloading and streaming the logs.')
 return

 managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -210,11 +205,11 @@ class JobsController:
 managed_job_state.set_local_log_file(self._job_id, task_id,
 log_file)
 else:
-
+logger.warning(
 f'No log file was downloaded for job {self._job_id}, '
 f'task {task_id}')

-
+logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

 async def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
 if cluster_name is None:
@@ -259,7 +254,7 @@ class JobsController:
 Other exceptions may be raised depending on the backend.
 """
 task_start_time = time.time()
-
+logger.info(
 f'Starting task {task_id} ({task.name}) for job {self._job_id}')

 latest_task_id, last_task_prev_status = (
@@ -271,22 +266,20 @@ class JobsController:
 managed_job_state.ManagedJobStatus.PENDING):
 assert latest_task_id >= task_id, (latest_task_id, task_id)
 if latest_task_id > task_id:
-
-
+logger.info(f'Task {task_id} ({task.name}) has already '
+'been executed. Skipping...')
 return True
 if latest_task_id == task_id:
 # Start recovery.
 is_resume = True
-
-f'Resuming task {task_id} from previous execution')
+logger.info(f'Resuming task {task_id} from previous execution')

 callback_func = managed_job_utils.event_callback_func(
 job_id=self._job_id, task_id=task_id, task=task)

 if task.run is None:
-
-
-'run commands being empty.')
+logger.info(f'Skip running task {task_id} ({task.name}) due to its '
+'run commands being empty.')
 # Call set_started first to initialize columns in the state table,
 # including start_at and last_recovery_at to avoid issues for
 # uninitialized columns.
@@ -300,8 +293,7 @@ class JobsController:
 task_id=task_id,
 end_time=time.time(),
 callback_func=callback_func)
-
-f'Empty task {task_id} marked as succeeded immediately')
+logger.info(f'Empty task {task_id} marked as succeeded immediately')
 return True

 usage_lib.messages.usage.update_task_id(task_id)
@@ -314,8 +306,7 @@ class JobsController:
 task.name, self._job_id) if self._pool is None else None
 self._strategy_executor = recovery_strategy.StrategyExecutor.make(
 cluster_name, self._backend, task, self._job_id, task_id,
-self.
-self.starting_signal)
+self._pool, self.starting, self.starting_lock, self.starting_signal)
 if not is_resume:
 submitted_at = time.time()
 if task_id == 0:
@@ -336,11 +327,11 @@ class JobsController:
 self._strategy_executor.max_restarts_on_errors
 },
 callback_func=callback_func)
-
-
-
+logger.info(f'Submitted managed job {self._job_id} '
+f'(task: {task_id}, name: {task.name!r}); '
+f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

-
+logger.info('Started monitoring.')

 # Only do the initial cluster launch if not resuming from a controller
 # failure. Otherwise, we will transit to recovering immediately.
@@ -354,7 +345,7 @@ class JobsController:
 remote_job_submitted_at = await self._strategy_executor.launch()

 launch_time = time.time() - launch_start
-
+logger.info(f'Cluster launch completed in {launch_time:.2f}s')
 assert remote_job_submitted_at is not None, remote_job_submitted_at
 if self._pool is None:
 job_id_on_pool_cluster = None
@@ -367,16 +358,16 @@ class JobsController:
 # Check if we have been cancelled here, in the case where a user
 # quickly cancels the job we want to gracefully handle it here,
 # otherwise we will end up in the FAILED_CONTROLLER state.
-
-
-
+logger.info(f'Cluster name is None for job {self._job_id}, '
+f'task {task_id}. Checking if we have been '
+'cancelled.')
 status = await (managed_job_state.get_job_status_with_task_id_async(
 job_id=self._job_id, task_id=task_id))
-
-
+logger.debug(f'Status for job {self._job_id}, task {task_id}:'
+f'{status}')
 if status == managed_job_state.ManagedJobStatus.CANCELLED:
-
-
+logger.info(f'Job {self._job_id}, task {task_id} has '
+'been quickly cancelled.')
 raise asyncio.CancelledError()
 assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

@@ -417,7 +408,7 @@ class JobsController:

 if prev_status is not None:
 if prev_status.is_terminal():
-
+logger.info(
 f'Task {task_id} already in terminal state: '
 f'{prev_status}')
 return (prev_status ==
@@ -427,9 +418,8 @@ class JobsController:
 # If the controller is down when cancelling the job,
 # we re-raise the error to run the `_cleanup` function
 # again to clean up any remaining resources.
-
-
-'re-raising cancellation')
+logger.info(f'Task {task_id} was being cancelled, '
+'re-raising cancellation')
 raise asyncio.CancelledError()
 if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
 force_transit_to_recovering = True
@@ -443,10 +433,9 @@ class JobsController:
 try:
 await backend_utils.async_check_network_connection()
 except exceptions.NetworkError:
-
-
-
-'seconds.')
+logger.info('Network is not available. Retrying again in '
+f'{managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS} '
+'seconds.')
 continue

 # NOTE: we do not check cluster status first because race condition
@@ -461,23 +450,22 @@ class JobsController:
 self._backend,
 cluster_name,
 job_id=job_id_on_pool_cluster,
-job_logger=self._logger,
 )
 except exceptions.FetchClusterInfoError as fetch_e:
-
+logger.info(
 'Failed to fetch the job status. Start recovery.\n'
 f'Exception: {common_utils.format_exception(fetch_e)}\n'
 f'Traceback: {traceback.format_exc()}')

 if job_status == job_lib.JobStatus.SUCCEEDED:
-
-
+logger.info(f'Task {task_id} succeeded! '
+'Getting end time and cleaning up')
 try:
 success_end_time = await context_utils.to_thread(
 managed_job_utils.try_to_get_job_end_time,
 self._backend, cluster_name, job_id_on_pool_cluster)
 except Exception as e: # pylint: disable=broad-except
-
+logger.warning(
 f'Failed to get job end time: '
 f'{common_utils.format_exception(e)}',
 exc_info=True)
@@ -490,7 +478,7 @@ class JobsController:
 task_id,
 end_time=success_end_time,
 callback_func=callback_func)
-
+logger.info(
 f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
 f'Cleaning up the cluster {cluster_name}.')
 try:
@@ -511,7 +499,7 @@ class JobsController:
 job_id_on_pool_cluster)
 except Exception as e: # pylint: disable=broad-except
 # We don't want to crash here, so just log and continue.
-
+logger.warning(
 f'Failed to download and stream logs: '
 f'{common_utils.format_exception(e)}',
 exc_info=True)
@@ -521,10 +509,10 @@ class JobsController:

 task_total_time = time.time() - task_start_time
 monitoring_time = time.time() - monitoring_start_time
-
-
-
-
+logger.info(f'Task {task_id} completed successfully in '
+f'{task_total_time:.2f}s '
+f'(monitoring time: {monitoring_time:.2f}s, '
+f'status checks: {status_check_count})')
 return True

 # For single-node jobs, non-terminated job_status indicates a
@@ -560,7 +548,7 @@ class JobsController:
 # code).
 cluster_status_str = ('' if cluster_status is None else
 f' (status: {cluster_status.value})')
-
+logger.info(
 f'Cluster is preempted or failed{cluster_status_str}. '
 'Recovering...')
 else:
@@ -571,12 +559,12 @@ class JobsController:
 in job_lib.JobStatus.user_code_failure_states() or
 job_status == job_lib.JobStatus.FAILED_DRIVER):
 # The user code has probably crashed, fail immediately.
-
+logger.info(
 f'Task {task_id} failed with status: {job_status}')
 end_time = await context_utils.to_thread(
 managed_job_utils.try_to_get_job_end_time,
 self._backend, cluster_name, job_id_on_pool_cluster)
-
+logger.info(
 f'The user job failed ({job_status}). Please check the '
 'logs below.\n'
 f'== Logs of the user job (ID: {self._job_id}) ==\n')
@@ -611,7 +599,7 @@ class JobsController:
 if should_restart_on_failure:
 max_restarts = (
 self._strategy_executor.max_restarts_on_errors)
-
+logger.info(
 f'User program crashed '
 f'({managed_job_status.value}). '
 f'Retry the job as max_restarts_on_errors is '
@@ -619,7 +607,7 @@ class JobsController:
 f'[{self._strategy_executor.restart_cnt_on_failure}'
 f'/{max_restarts}]')
 else:
-
+logger.info(
 f'Task {task_id} failed and will not be retried')
 await managed_job_state.set_failed_async(
 self._job_id,
@@ -632,7 +620,7 @@ class JobsController:
 elif job_status is not None:
 # Either the job is cancelled (should not happen) or in some
 # unknown new state that we do not handle.
-
+logger.error(f'Unknown job status: {job_status}')
 failure_reason = (
 f'Unknown job status {job_status}. To see the details, '
 f'run: sky jobs logs --controller {self._job_id}')
@@ -649,10 +637,9 @@ class JobsController:
 # job status. Try to recover the job (will not restart the
 # cluster, if the cluster is healthy).
 assert job_status is None, job_status
-
-
-
-'(the cluster will not be restarted).')
+logger.info('Failed to fetch the job status while the '
+'cluster is healthy. Try to recover the job '
+'(the cluster will not be restarted).')
 # When the handle is None, the cluster should be cleaned up already.
 if handle is not None:
 resources = handle.launched_resources
@@ -671,15 +658,14 @@ class JobsController:
 # Some spot resource (e.g., Spot TPU VM) may need to be
 # cleaned up after preemption, as running launch again on
 # those clusters again may fail.
-
-
-'...')
+logger.info('Cleaning up the preempted or failed cluster'
+'...')
 await self._cleanup_cluster(cluster_name)

 # Try to recover the managed jobs, when the cluster is preempted or
 # failed or the job status is failed to be fetched.
-
-
+logger.info(f'Starting recovery for task {task_id}, '
+f'it is currently {job_status}')
 await managed_job_state.set_recovering_async(
 job_id=self._job_id,
 task_id=task_id,
@@ -701,7 +687,7 @@ class JobsController:

 async def run(self):
 """Run controller logic and handle exceptions."""
-
+logger.info(f'Starting JobsController run for job {self._job_id}')
 task_id = 0
 cancelled = False

@@ -709,39 +695,36 @@
 succeeded = True
 # We support chain DAGs only for now.
 for task_id, task in enumerate(self._dag.tasks):
-
+logger.info(
 f'Processing task {task_id}/{len(self._dag.tasks)-1}: '
 f'{task.name}')
 task_start = time.time()
 succeeded = await self._run_one_task(task_id, task)
 task_time = time.time() - task_start
-
-
-f'with success={succeeded}')
+logger.info(f'Task {task_id} completed in {task_time:.2f}s '
+f'with success={succeeded}')

 if not succeeded:
-
-f'Task {task_id} failed, stopping execution')
+logger.info(f'Task {task_id} failed, stopping execution')
 break

 except exceptions.ProvisionPrechecksError as e:
 # Please refer to the docstring of self._run for the cases when
 # this exception can occur.
-
+logger.error(f'Provision prechecks failed for task {task_id}')
 failure_reason = ('; '.join(
 common_utils.format_exception(reason, use_bracket=True)
 for reason in e.reasons))
-
+logger.error(failure_reason)
 await self._update_failed_task_state(
 task_id, managed_job_state.ManagedJobStatus.FAILED_PRECHECKS,
 failure_reason)
 except exceptions.ManagedJobReachedMaxRetriesError as e:
 # Please refer to the docstring of self._run for the cases when
 # this exception can occur.
-
-f'Managed job reached max retries for task {task_id}')
+logger.error(f'Managed job reached max retries for task {task_id}')
 failure_reason = common_utils.format_exception(e)
-
+logger.error(failure_reason)
 # The managed job should be marked as FAILED_NO_RESOURCE, as the
 # managed job may be able to launch next time.
 await self._update_failed_task_state(
@@ -753,13 +736,13 @@ class JobsController:
 cancelled = True
 raise
 except (Exception, SystemExit) as e: # pylint: disable=broad-except
-
+logger.error(
 f'Unexpected error in JobsController run for task {task_id}')
 with ux_utils.enable_traceback():
-
+logger.error(traceback.format_exc())
 msg = ('Unexpected error occurred: ' +
 common_utils.format_exception(e, use_bracket=True))
-
+logger.error(msg)
 await self._update_failed_task_state(
 task_id, managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
 msg)
@@ -783,8 +766,8 @@ class JobsController:
 failure_type: managed_job_state.ManagedJobStatus,
 failure_reason: str):
 """Update the state of the failed task."""
-
-
+logger.info(f'Updating failed task state: task_id={task_id}, '
+f'failure_type={failure_type}')
 await managed_job_state.set_failed_async(
 self._job_id,
 task_id=task_id,
@@ -799,7 +782,8 @@
 class Controller:
 """Controller for managing jobs."""

-def __init__(self) -> None:
+def __init__(self, controller_uuid: str) -> None:
+self._controller_uuid = controller_uuid
 # Global state for active jobs
 self.job_tasks: Dict[int, asyncio.Task] = {}
 self.starting: Set[int] = set()
@@ -813,10 +797,11 @@ class Controller:
 # launch).
 self._starting_signal = asyncio.Condition(lock=self._job_tasks_lock)

+self._pid = os.getpid()
+
 async def _cleanup(self,
 job_id: int,
 dag_yaml: str,
-job_logger: logging.Logger,
 pool: Optional[str] = None):
 """Clean up the cluster(s) and storages.

@@ -842,14 +827,13 @@ class Controller:
 cluster_name = (
 managed_job_utils.generate_managed_job_cluster_name(
 task.name, job_id))
-managed_job_utils.terminate_cluster(cluster_name
-_logger=job_logger)
+managed_job_utils.terminate_cluster(cluster_name)
 status = core.status(cluster_names=[cluster_name],
 all_users=True)
 assert (len(status) == 0 or
 status[0]['status'] == sky.ClusterStatus.STOPPED), (
 f'{cluster_name} is not down: {status}')
-
+logger.info(f'{cluster_name} is down')
 else:
 cluster_name, job_id_on_pool_cluster = (
 managed_job_state.get_pool_submit_info(job_id))
@@ -860,7 +844,7 @@ class Controller:
 _try_cancel_if_cluster_is_init=True)
 except Exception as e: # pylint: disable=broad-except
 error = e
-
+logger.warning(
 f'Failed to terminate cluster {cluster_name}: {e}')
 # we continue to try cleaning up whatever else we can.
 # Clean up Storages with persistent=False.
@@ -874,7 +858,7 @@ class Controller:
 for storage in task.storage_mounts.values():
 storage.construct()
 except (exceptions.StorageSpecError, exceptions.StorageError) as e:
-
+logger.warning(
 f'Failed to construct storage object for teardown: {e}\n'
 'This may happen because storage construction already '
 'failed during launch, storage was deleted externally, '
@@ -884,7 +868,7 @@ class Controller:
 backend.teardown_ephemeral_storage(task)
 except Exception as e: # pylint: disable=broad-except
 error = e
-
+logger.warning(f'Failed to teardown ephemeral storage: {e}')
 # we continue to try cleaning up whatever else we can.

 # Clean up any files mounted from the local disk, such as two-hop
@@ -902,7 +886,7 @@ class Controller:
 else:
 os.remove(path)
 except Exception as e: # pylint: disable=broad-except
-
+logger.warning(
 f'Failed to clean up file mount {file_mount}: {e}')

 if error is not None:
@@ -924,11 +908,10 @@ class Controller:

 # Use context.contextual to enable per-job output redirection and env var
 # isolation.
-@context.
+@context.contextual_async
 async def run_job_loop(self,
 job_id: int,
 dag_yaml: str,
-job_logger: logging.Logger,
 log_file: str,
 env_file_path: Optional[str] = None,
 pool: Optional[str] = None):
@@ -937,45 +920,52 @@ class Controller:
 assert ctx is not None, 'Context is not initialized'
 ctx.redirect_log(pathlib.Path(log_file))

+logger.info(f'Starting job loop for {job_id}')
+logger.info(f' dag_yaml={dag_yaml}')
+logger.info(f' log_file={log_file}')
+logger.info(f' env_file_path={env_file_path}')
+logger.info(f' pool={pool}')
+logger.info(f'From controller {self._controller_uuid}')
+logger.info(f' pid={self._pid}')
+
 # Load and apply environment variables from the job's environment file
 if env_file_path and os.path.exists(env_file_path):
 try:
 # Load environment variables from the file
 env_vars = dotenv.dotenv_values(env_file_path)
-
-
+logger.info(f'Loading environment from {env_file_path}: '
+f'{list(env_vars.keys())}')

 # Apply environment variables to the job's context
 if ctx is not None:
 for key, value in env_vars.items():
 if value is not None:
 ctx.override_envs({key: value})
-
+logger.debug(
 f'Set environment variable: {key}={value}')
 # Reload the skypilot config for this context to make sure
 # the latest config is used.
 skypilot_config.reload_config()
 else:
-
+logger.error(
 'Context is None, cannot set environment variables')
 except Exception as e: # pylint: disable=broad-except
-
+logger.error(
 f'Failed to load environment file {env_file_path}: {e}')
 elif env_file_path:
-
+logger.error(f'Environment file not found: {env_file_path}')

 cancelling = False
 try:
-
+logger.info(f'Starting job loop for {job_id}')

-controller = JobsController(job_id, dag_yaml,
-self.
+controller = JobsController(job_id, dag_yaml, self.starting,
+self._job_tasks_lock,
 self._starting_signal, pool)

 async with self._job_tasks_lock:
 if job_id in self.job_tasks:
-
-f'Job {job_id} already exists in job_tasks')
+logger.error(f'Job {job_id} already exists in job_tasks')
 raise ValueError(f'Job {job_id} already exists')

 # Create the task and store it
@@ -985,13 +975,13 @@ class Controller:
 self.job_tasks[job_id] = task
 await task
 except asyncio.CancelledError:
-
+logger.info(f'Job {job_id} was cancelled')
 dag, _ = _get_dag_and_name(dag_yaml)
 task_id, _ = await (
 managed_job_state.get_latest_task_id_status_async(job_id))
 assert task_id is not None, job_id
-
-
+logger.info(f'Cancelling managed job, job_id: {job_id}, '
+f'task_id: {task_id}')
 await managed_job_state.set_cancelling_async(
 job_id=job_id,
 callback_func=managed_job_utils.event_callback_func(
@@ -999,16 +989,13 @@ class Controller:
 cancelling = True
 raise
 except Exception as e:
-
-
+logger.error(f'Unexpected error in job loop for {job_id}: '
+f'{common_utils.format_exception(e)}')
 raise
 finally:
 try:
-await self._cleanup(job_id,
-
-job_logger=job_logger,
-pool=pool)
-job_logger.info(
+await self._cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
+logger.info(
 f'Cluster of managed job {job_id} has been cleaned up.')
 except Exception as e: # pylint: disable=broad-except
 failure_reason = ('Failed to clean up: '
@@ -1037,7 +1024,7 @@ class Controller:
 # The job can be non-terminal if the controller exited abnormally,
 # e.g. failed to launch cluster after reaching the MAX_RETRY.
 if not job_status.is_terminal():
-
+logger.info(f'Previous job status: {job_status.value}')
 await managed_job_state.set_failed_async(
 job_id,
 task_id=None,
@@ -1080,37 +1067,20 @@ class Controller:
 dag_yaml: Path to the YAML file containing the DAG definition.
 env_file_path: Optional path to environment file for the job.
 """
-# Create
+# Create log file path for job output redirection
 log_dir = os.path.expanduser(jobs_constants.JOBS_CONTROLLER_LOGS_DIR)
 os.makedirs(log_dir, exist_ok=True)
 log_file = os.path.join(log_dir, f'{job_id}.log')

-
-
-
-# Create file handler
-file_handler = logging.FileHandler(log_file)
-file_handler.setLevel(logging.DEBUG)
-
-# Use Sky's standard formatter
-file_handler.setFormatter(sky_logging.FORMATTER)
-
-# Add the handler to the logger
-job_logger.addHandler(file_handler)
-
-# Prevent log propagation to avoid duplicate logs
-job_logger.propagate = False
-
-job_logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
-f'env_file_path={env_file_path}')
+logger.info(f'Starting job {job_id} with dag_yaml={dag_yaml}, '
+f'env_file_path={env_file_path}, and log_file={log_file}')

 async with self._job_tasks_lock:
 self.starting.add(job_id)
 await create_background_task(
-self.run_job_loop(job_id, dag_yaml,
-env_file_path, pool))
+self.run_job_loop(job_id, dag_yaml, log_file, env_file_path, pool))

-
+logger.info(f'Job {job_id} started successfully')

 async def cancel_job(self):
 """Cancel an existing job."""
@@ -1161,6 +1131,7 @@ class Controller:
 scheduler.get_number_of_controllers()))

 if len(running_tasks) >= max_jobs:
+logger.info('Too many jobs running, waiting for 60 seconds')
 await asyncio.sleep(60)
 continue

@@ -1174,9 +1145,11 @@ class Controller:
 continue

 if waiting_job is None:
+logger.info('No waiting job, waiting for 10 seconds')
 await asyncio.sleep(10)
 continue

+logger.info(f'Claiming job {waiting_job["job_id"]}')
 job_id = waiting_job['job_id']
 dag_yaml_path = waiting_job['dag_yaml_path']
 env_file_path = waiting_job.get('env_file_path')
@@ -1202,10 +1175,12 @@ class Controller:
 await self.start_job(job_id, dag_yaml_path, env_file_path, pool)


-async def main():
+async def main(controller_uuid: str):
+logger.info(f'Starting controller {controller_uuid}')
+
 context_utils.hijack_sys_attrs()

-controller = Controller()
+controller = Controller(controller_uuid)

 # Will happen multiple times, who cares though
 os.makedirs(jobs_constants.CONSOLIDATED_SIGNAL_PATH, exist_ok=True)
@@ -1214,6 +1189,8 @@ async def main():
 soft = None
 try:
 soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+logger.info(f'Current rlimits for NOFILE: soft={soft}, hard={hard}')
+logger.info(f'Increasing soft limit to {hard}')
 resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
 except OSError as e:
 logger.warning(f'Failed to increase number of files we can open: {e}\n'
@@ -1231,4 +1208,4 @@ async def main():


 if __name__ == '__main__':
-asyncio.run(main())
+asyncio.run(main(sys.argv[1]))