skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +72 -68
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -100,6 +100,13 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('current_cluster_name',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('job_id_on_pool_cluster',
+                      sqlalchemy.Integer,
+                      server_default=None),
 )

 ha_recovery_script_table = sqlalchemy.Table(
@@ -215,6 +222,9 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'priority': r['priority'],
         'entrypoint': r['entrypoint'],
         'original_user_yaml_path': r['original_user_yaml_path'],
+        'pool': r['pool'],
+        'current_cluster_name': r['current_cluster_name'],
+        'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
     }


@@ -451,8 +461,8 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):


 @_init_db
-def set_job_info_without_job_id(name: str, workspace: str,
+def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
+                                pool: Optional[str]) -> int:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -469,6 +479,7 @@ def set_job_info_without_job_id(name: str, workspace: str,
             schedule_state=ManagedJobScheduleState.INACTIVE.value,
             workspace=workspace,
             entrypoint=entrypoint,
+            pool=pool,
         )

         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -1278,6 +1289,56 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     return updated_count == 0


+@_init_db
+def get_pool_from_job_id(job_id: int) -> Optional[str]:
+    """Get the pool from the job id."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        pool = session.execute(
+            sqlalchemy.select(job_info_table.c.pool).where(
+                job_info_table.c.spot_job_id == job_id)).fetchone()
+        return pool[0] if pool else None
+
+
+@_init_db
+def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
+    """Set the current cluster name for a job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(job_info_table).filter(
+            job_info_table.c.spot_job_id == job_id).update(
+                {job_info_table.c.current_cluster_name: current_cluster_name})
+        session.commit()
+
+
+@_init_db
+def set_job_id_on_pool_cluster(job_id: int,
+                               job_id_on_pool_cluster: int) -> None:
+    """Set the job id on the pool cluster for a job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(job_info_table).filter(
+            job_info_table.c.spot_job_id == job_id).update({
+                job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+            })
+        session.commit()
+
+
+@_init_db
+def get_pool_submit_info(job_id: int) -> Tuple[Optional[str], Optional[int]]:
+    """Get the cluster name and job id on the pool from the managed job id."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        info = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.current_cluster_name,
+                job_info_table.c.job_id_on_pool_cluster).where(
+                    job_info_table.c.spot_job_id == job_id)).fetchone()
+        if info is None:
+            return None, None
+        return info[0], info[1]
+
+
 @_init_db
 def scheduler_set_launching(job_id: int,
                             current_state: ManagedJobScheduleState) -> None:
@@ -1398,28 +1459,68 @@ def get_num_launching_jobs() -> int:
             sqlalchemy.select(
                 sqlalchemy.func.count()  # pylint: disable=not-callable
             ).select_from(job_info_table).where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state ==
+                    ManagedJobScheduleState.LAUNCHING.value,
+                    # We only count jobs that are not in the pool, because the
+                    # job in the pool does not actually calling the sky.launch.
+                    job_info_table.c.pool.is_(None)))).fetchone()[0]


 @_init_db
-def get_num_alive_jobs() -> int:
+def get_num_alive_jobs(pool: Optional[str] = None) -> int:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        where_conditions = [
+            job_info_table.c.schedule_state.in_([
+                ManagedJobScheduleState.ALIVE_WAITING.value,
+                ManagedJobScheduleState.LAUNCHING.value,
+                ManagedJobScheduleState.ALIVE.value,
+                ManagedJobScheduleState.ALIVE_BACKOFF.value,
+            ])
+        ]
+
+        if pool is not None:
+            where_conditions.append(job_info_table.c.pool == pool)
+
         return session.execute(
             sqlalchemy.select(
                 sqlalchemy.func.count()  # pylint: disable=not-callable
             ).select_from(job_info_table).where(
+                sqlalchemy.and_(*where_conditions))).fetchone()[0]
+
+
+@_init_db
+def get_nonterminal_job_ids_by_pool(pool: str,
+                                    cluster_name: Optional[str] = None
+                                   ) -> List[int]:
+    """Get nonterminal job ids in a pool."""
+    assert _SQLALCHEMY_ENGINE is not None
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = sqlalchemy.select(
+            spot_table.c.spot_job_id.distinct()).select_from(
+                spot_table.outerjoin(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+        and_conditions = [
+            ~spot_table.c.status.in_([
+                status.value for status in ManagedJobStatus.terminal_statuses()
+            ]),
+            job_info_table.c.pool == pool,
+        ]
+        if cluster_name is not None:
+            and_conditions.append(
+                job_info_table.c.current_cluster_name == cluster_name)
+        query = query.where(sqlalchemy.and_(*and_conditions)).order_by(
+            spot_table.c.spot_job_id.asc())
+        rows = session.execute(query).fetchall()
+        job_ids = [row[0] for row in rows if row[0] is not None]
+        return job_ids


 @_init_db
-def get_waiting_job() -> Optional[Dict[str, Any]]:
+def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.

     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1442,23 +1543,26 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
                 ManagedJobScheduleState.ALIVE_BACKOFF.value,
             ])).scalar_subquery()
         # Main query for waiting jobs
+        select_conds = [
+            job_info_table.c.schedule_state.in_([
+                ManagedJobScheduleState.WAITING.value,
+                ManagedJobScheduleState.ALIVE_WAITING.value,
+            ]),
+            job_info_table.c.priority >= sqlalchemy.func.coalesce(
+                max_priority_subquery, 0),
+        ]
+        if pool is not None:
+            select_conds.append(job_info_table.c.pool == pool)
         query = sqlalchemy.select(
             job_info_table.c.spot_job_id,
             job_info_table.c.schedule_state,
             job_info_table.c.dag_yaml_path,
             job_info_table.c.env_file_path,
-            ]),
-            job_info_table.c.priority >= sqlalchemy.func.coalesce(
-                max_priority_subquery, 0),
-            )).order_by(
-                job_info_table.c.priority.desc(),
-                job_info_table.c.spot_job_id.asc(),
-            ).limit(1)
+            job_info_table.c.pool,
+        ).where(sqlalchemy.and_(*select_conds)).order_by(
+            job_info_table.c.priority.desc(),
+            job_info_table.c.spot_job_id.asc(),
+        ).limit(1)
         waiting_job_row = session.execute(query).fetchone()
         if waiting_job_row is None:
             return None
@@ -1468,6 +1572,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
         'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
         'dag_yaml_path': waiting_job_row[2],
         'env_file_path': waiting_job_row[3],
+        'pool': waiting_job_row[4],
     }
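For context, the new pool-related state helpers above are plain getters and setters keyed on the managed job id. A minimal usage sketch follows (illustrative only: the job id and cluster name are made up, and the jobs controller is assumed to have already initialized the database through the @_init_db decorator):

from sky.jobs import state as managed_job_state

# Hypothetical values: record where a pooled managed job actually runs.
managed_job_state.set_current_cluster_name(
    job_id=7, current_cluster_name='my-pool-worker-1')
managed_job_state.set_job_id_on_pool_cluster(job_id=7, job_id_on_pool_cluster=3)

# Later (e.g. when tailing logs), resolve the submission target again.
cluster_name, job_id_on_cluster = managed_job_state.get_pool_submit_info(7)
pool = managed_job_state.get_pool_from_job_id(7)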
sky/jobs/utils.py
CHANGED
@@ -30,7 +30,6 @@ from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
-from sky.server import common as server_common
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
@@ -39,7 +38,6 @@ from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import env_options
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -136,12 +134,6 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
 def _validate_consolidation_mode_config(
         current_is_consolidation_mode: bool) -> None:
     """Validate the consolidation mode config."""
-    if (current_is_consolidation_mode and
-            not env_options.Options.IS_DEVELOPER.get() and
-            server_common.is_api_server_local()):
-        with ux_utils.print_exception_no_traceback():
-            raise exceptions.NotSupportedError(
-                'Consolidation mode is not supported when running locally.')
     # Check whether the consolidation mode config is changed.
     if current_is_consolidation_mode:
         controller_cn = (
@@ -239,8 +231,8 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')


-def get_job_status(backend: 'backends.CloudVmRayBackend',
+def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
+                   job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -253,10 +245,13 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
         logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
+    job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
             logger.info('=== Checking the job status... ===')
-            statuses = backend.get_job_status(handle,
+            statuses = backend.get_job_status(handle,
+                                              job_ids=job_ids,
+                                              stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
                 logger.info('No job found.')
@@ -323,13 +318,20 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     error_msg = None
     tasks = managed_job_state.get_managed_jobs(job_id)
     for task in tasks:
+        pool = task.get('pool', None)
+        if pool is None:
+            task_name = task['job_name']
+            cluster_name = generate_managed_job_cluster_name(
+                task_name, job_id)
+        else:
+            cluster_name, _ = (
+                managed_job_state.get_pool_submit_info(job_id))
         handle = global_user_state.get_handle_from_cluster_name(
             cluster_name)
         if handle is not None:
             try:
+                if pool is None:
+                    terminate_cluster(cluster_name)
             except Exception as e:  # pylint: disable=broad-except
                 error_msg = (
                     f'Failed to terminate cluster {cluster_name}: '
@@ -510,10 +512,10 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):


 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                      get_end_time: bool) -> float:
+                      job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
     code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=
+        job_id=job_id, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
@@ -527,14 +529,17 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,


 def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
-                            cluster_name: str) -> float:
+                            cluster_name: str, job_id: Optional[int]) -> float:
     """Try to get the end time of the job.

     If the job is preempted or we can't connect to the instance for whatever
     reason, fall back to the current time.
     """
     try:
-        return get_job_timestamp(backend,
+        return get_job_timestamp(backend,
+                                 cluster_name,
+                                 job_id=job_id,
+                                 get_end_time=True)
     except exceptions.CommandError as e:
         if e.returncode == 255:
             # Failed to connect - probably the instance was preempted since the
@@ -556,8 +561,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
     if event_callback is None or task is None:
         return
     event_callback = event_callback.strip()
+    pool = managed_job_state.get_pool_from_job_id(job_id)
+    if pool is not None:
+        cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
+    else:
+        cluster_name = generate_managed_job_cluster_name(
+            task.name, job_id) if task.name else None
     logger.info(f'=== START: event callback for {status!r} ===')
     log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
                             'managed_job_event',
@@ -684,6 +693,15 @@ def cancel_job_by_name(job_name: str,
     return f'{job_name!r} {msg}'


+def cancel_jobs_by_pool(pool_name: str,
+                        current_workspace: Optional[str] = None) -> str:
+    """Cancel all jobs in a pool."""
+    job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
+    if not job_ids:
+        return f'No running job found in pool {pool_name!r}.'
+    return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+
+
 def stream_logs_by_id(job_id: int,
                       follow: bool = True,
                       tail: Optional[int] = None) -> Tuple[str, int]:
@@ -777,12 +795,19 @@ def stream_logs_by_id(job_id: int,

     while should_keep_logging(managed_job_status):
         handle = None
+        job_id_to_tail = None
         if task_id is not None:
+            pool = managed_job_state.get_pool_from_job_id(job_id)
+            if pool is not None:
+                cluster_name, job_id_to_tail = (
+                    managed_job_state.get_pool_submit_info(job_id))
+            else:
+                task_name = managed_job_state.get_task_name(job_id, task_id)
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            if cluster_name is not None:
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)

         # Check the handle: The cluster can be preempted and removed from
         # the table before the managed job state is updated by the
@@ -814,7 +839,7 @@ def stream_logs_by_id(job_id: int,
             status_display.stop()
             tail_param = tail if tail is not None else 0
             returncode = backend.tail_logs(handle,
-                                           job_id=
+                                           job_id=job_id_to_tail,
                                            managed_job_id=job_id,
                                            follow=follow,
                                            tail=tail_param)
@@ -1132,9 +1157,15 @@ def dump_managed_job_queue() -> str:
         job['status'] = job['status'].value
         job['schedule_state'] = job['schedule_state'].value

+        pool = managed_job_state.get_pool_from_job_id(job['job_id'])
+        if pool is not None:
+            cluster_name, _ = managed_job_state.get_pool_submit_info(
+                job['job_id'])
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                job['task_name'], job['job_id'])
+        handle = global_user_state.get_handle_from_cluster_name(
+            cluster_name) if cluster_name is not None else None
         if isinstance(handle, backends.CloudVmRayResourceHandle):
             resources_str = resources_utils.get_readable_resources_repr(
                 handle, simplify=True)
@@ -1145,6 +1176,11 @@ def dump_managed_job_queue() -> str:
             job['cloud'] = str(handle.launched_resources.cloud)
             job['region'] = handle.launched_resources.region
             job['zone'] = handle.launched_resources.zone
+            job['infra'] = infra_utils.InfraInfo(
+                str(handle.launched_resources.cloud),
+                handle.launched_resources.region,
+                handle.launched_resources.zone).formatted_str()
+            job['accelerators'] = handle.launched_resources.accelerators
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'
@@ -1152,6 +1188,7 @@ def dump_managed_job_queue() -> str:
             job['cloud'] = '-'
             job['region'] = '-'
             job['zone'] = '-'
+            job['infra'] = '-'

         # Add details about schedule state / backoff.
         state_details = None
@@ -1292,10 +1329,13 @@ def format_job_table(
         'JOB DURATION',
         '#RECOVERIES',
         'STATUS',
+        'WORKER_POOL',
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
         columns += [
+            'WORKER_CLUSTER',
+            'WORKER_JOB_ID',
             'STARTED',
             'INFRA',
             'RESOURCES',
@@ -1405,11 +1445,14 @@ def format_job_table(
             job_duration,
             recovery_cnt,
             status_str,
+            job_tasks[0].get('pool', '-'),
         ]
         if show_all:
             details = job_tasks[current_task_id].get('details')
             failure_reason = job_tasks[current_task_id]['failure_reason']
             job_values.extend([
+                '-',
+                '-',
                 '-',
                 '-',
                 '-',
@@ -1445,37 +1488,43 @@ def format_job_table(
                 job_duration,
                 task['recovery_count'],
                 task['status'].colored_str(),
+                task.get('pool', '-'),
             ]
             if show_all:
                 # schedule_state is only set at the job level, so if we have
                 # more than one task, only display on the aggregated row.
                 schedule_state = (task['schedule_state']
                                   if len(job_tasks) == 1 else '-')
-                if
+                infra_str = task.get('infra')
+                if infra_str is None:
+                    cloud = task.get('cloud')
+                    if cloud is None:
+                        # Backward compatibility for old jobs controller without
+                        # cloud info returned, we parse it from the cluster
+                        # resources
+                        # TODO(zhwu): remove this after 0.12.0
+                        cloud = task['cluster_resources'].split('(')[0].split(
+                            'x')[-1]
+                        task['cluster_resources'] = task[
+                            'cluster_resources'].replace(f'{cloud}(',
+                                                         '(').replace(
+                                                             'x ', 'x')
+                    region = task['region']
+                    zone = task.get('zone')
+                    if cloud == '-':
+                        cloud = None
+                    if region == '-':
+                        region = None
+                    if zone == '-':
+                        zone = None
+                    infra_str = infra_utils.InfraInfo(cloud, region,
+                                                      zone).formatted_str()
                 values.extend([
+                    task.get('current_cluster_name', '-'),
+                    task.get('job_id_on_pool_cluster', '-'),
                     # STARTED
                     log_utils.readable_time_duration(task['start_at']),
+                    infra_str,
                     task['cluster_resources'],
                     schedule_state,
                     generate_details(task.get('details'),
@@ -1567,6 +1616,15 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)

+    @classmethod
+    def cancel_jobs_by_pool(cls, pool_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
+        code = textwrap.dedent(f"""\
+        msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
+        print(msg, end="", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_version_and_job_table(cls) -> str:
         """Generate code to get controller version and raw job table."""
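The new cancel_jobs_by_pool helper above simply resolves the nonterminal job ids of a pool and delegates to the existing cancel_jobs_by_id path, and ManagedJobCodeGen.cancel_jobs_by_pool generates equivalent code for execution on the jobs controller. A hedged controller-side usage sketch (the pool and workspace names are placeholders, not values from this release):

from sky.jobs import utils as managed_job_utils

# Cancel everything still running in a hypothetical pool named 'my-pool'.
msg = managed_job_utils.cancel_jobs_by_pool('my-pool',
                                            current_workspace='default')
print(msg, end='', flush=True)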
sky/provision/nebius/constants.py
CHANGED

@@ -15,6 +15,9 @@ INFINIBAND_ENV_VARS = {
         'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
 }

+# pylint: disable=line-too-long
+INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'
+
 # Docker run options for InfiniBand support
 INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']
sky/py.typed
ADDED
File without changes
sky/resources.py
CHANGED
@@ -19,6 +19,7 @@ from sky.clouds import cloud as sky_cloud
 from sky.provision import docker_utils
 from sky.provision.gcp import constants as gcp_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.nebius import constants as nebius_constants
 from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import annotations
@@ -1260,15 +1261,19 @@
             ValueError: if the attribute is invalid.
         """

-        if
+        if self._network_tier == resources_utils.NetworkTier.BEST:
+            if isinstance(self._cloud, clouds.GCP):
+                # Handle GPU Direct TCPX requirement for docker images
+                if self._image_id is None:
+                    self._image_id = {
+                        self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
+                    }
+            elif isinstance(self._cloud, clouds.Nebius):
+                if self._image_id is None:
+                    self._image_id = {
+                        self._region: nebius_constants.INFINIBAND_IMAGE_ID
+                    }
+            elif self._image_id:
                 # Custom image specified - validate it's a docker image
                 # Check if any of the specified images are not docker images
                 non_docker_images = []
@@ -1280,14 +1285,13 @@
             if non_docker_images:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'When using network_tier=BEST
+                        f'When using network_tier=BEST, image_id '
                         f'must be a docker image. '
                         f'Found non-docker images: '
                         f'{", ".join(non_docker_images)}. '
                         f'Please either: (1) use a docker image '
                         f'(prefix with "docker:"), or '
-                        f'(2) leave image_id empty to use the default
-                        f'GPU Direct TCPX image.')
+                        f'(2) leave image_id empty to use the default')

         if self._image_id is None:
             return
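The rewritten network_tier=BEST branch above amounts to a small image-selection rule: with no image_id set, GCP falls back to the GPU Direct TCPX image and Nebius to the new InfiniBand NCCL-tests image, while an explicitly provided image_id must be a docker: image. A standalone restatement of that rule for illustration only (the helper name and the string cloud names below are assumptions, not SkyPilot API):

from typing import Dict, Optional

from sky.provision.gcp import constants as gcp_constants
from sky.provision.nebius import constants as nebius_constants


def default_best_tier_image(cloud_name: str,
                            region: Optional[str]) -> Dict[Optional[str], str]:
    """Mirror of the GCP/Nebius defaults applied when network_tier is BEST."""
    if cloud_name == 'GCP':
        # Same constant the diff references for GPU Direct TCPX.
        return {region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID}
    if cloud_name == 'Nebius':
        # New constant added in sky/provision/nebius/constants.py.
        return {region: nebius_constants.INFINIBAND_IMAGE_ID}
    raise ValueError(f'No network_tier=BEST default image for {cloud_name}')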
sky/schemas/db/spot_jobs/002_cluster_pool.py
ADDED

@@ -0,0 +1,42 @@
+"""Columns for cluster pool.
+
+Revision ID: 002
+Revises: 001
+Create Date: 2025-07-18
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '002'
+down_revision: Union[str, Sequence[str], None] = '001'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns for cluster pool."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'pool',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'current_cluster_name',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'job_id_on_pool_cluster',
+                                             sa.Integer(),
+                                             server_default=None)
+
+
+def downgrade():
+    """Remove columns for cluster pool."""
+    pass