skypilot-nightly 1.0.0.dev20250611__py3-none-any.whl → 1.0.0.dev20250613__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +3 -2
- sky/backends/backend_utils.py +8 -2
- sky/benchmark/benchmark_state.py +2 -1
- sky/catalog/data_fetchers/fetch_aws.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +1 -1
- sky/check.py +43 -3
- sky/cli.py +1 -1
- sky/client/cli.py +1 -1
- sky/clouds/cloud.py +1 -1
- sky/clouds/gcp.py +1 -1
- sky/clouds/kubernetes.py +9 -3
- sky/clouds/ssh.py +7 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-208a9812ab4f61c9.js → webpack-5c3e6471d04780c6.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/global_user_state.py +38 -0
- sky/jobs/server/core.py +1 -68
- sky/jobs/state.py +43 -44
- sky/provision/common.py +1 -1
- sky/provision/gcp/config.py +1 -1
- sky/provision/kubernetes/instance.py +2 -1
- sky/provision/kubernetes/utils.py +60 -13
- sky/resources.py +2 -2
- sky/serve/serve_state.py +81 -15
- sky/server/requests/preconditions.py +1 -1
- sky/server/requests/requests.py +11 -6
- sky/skylet/configs.py +26 -19
- sky/skylet/job_lib.py +3 -5
- sky/task.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +0 -23
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/common_utils.py +6 -0
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +10 -0
- sky/utils/infra_utils.py +1 -1
- sky/utils/kubernetes/generate_kubeconfig.sh +1 -1
- {skypilot_nightly-1.0.0.dev20250611.dist-info → skypilot_nightly-1.0.0.dev20250613.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250611.dist-info → skypilot_nightly-1.0.0.dev20250613.dist-info}/RECORD +58 -62
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- /sky/dashboard/out/_next/static/{zJqasksBQ3HcqMpA2wTUZ → UdgJCk2sZFLJgFJW_qiWG}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{zJqasksBQ3HcqMpA2wTUZ → UdgJCk2sZFLJgFJW_qiWG}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250611.dist-info → skypilot_nightly-1.0.0.dev20250613.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250611.dist-info → skypilot_nightly-1.0.0.dev20250613.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250611.dist-info → skypilot_nightly-1.0.0.dev20250613.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250611.dist-info → skypilot_nightly-1.0.0.dev20250613.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -45,6 +45,7 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
+_ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
 _DB_INIT_LOCK = threading.Lock()
@@ -1087,6 +1088,43 @@ def _get_enabled_clouds_key(cloud_capability: 'cloud.CloudCapability',
     return _ENABLED_CLOUDS_KEY_PREFIX + workspace + '_' + cloud_capability.value
 
 
+@_init_db
+def get_allowed_clouds(workspace: str) -> List[str]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(config_table).filter_by(
+            key=_get_allowed_clouds_key(workspace)).first()
+        if row:
+            return json.loads(row.value)
+        return []
+
+
+@_init_db
+def set_allowed_clouds(allowed_clouds: List[str], workspace: str) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(config_table).values(
+            key=_get_allowed_clouds_key(workspace),
+            value=json.dumps(allowed_clouds))
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[config_table.c.key],
+            set_={config_table.c.value: json.dumps(allowed_clouds)})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def _get_allowed_clouds_key(workspace: str) -> str:
+    return _ALLOWED_CLOUDS_KEY_PREFIX + workspace
+
+
 @_init_db
 def add_or_update_storage(storage_name: str,
                           storage_handle: 'Storage.StorageMetadata',
sky/jobs/server/core.py
CHANGED
@@ -1,9 +1,6 @@
 """SDK functions for managed jobs."""
 import os
-import signal
-import subprocess
 import tempfile
-import time
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 import uuid
@@ -213,8 +210,6 @@ def launch(
         'remote_env_file_path': remote_env_file_path,
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
-        'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
-        'dashboard_user_id': common.SERVER_ID,
         'priority': priority,
         **controller_utils.shared_controller_vars_to_fill(
             controller,
@@ -368,20 +363,7 @@ def _maybe_restart_controller(
             skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
         handle = core.start(
             cluster_name=jobs_controller_type.value.cluster_name)
-
-    # We should not directly use execution.launch() and have the dashboard cmd
-    # in the task setup because since we are using detached_setup, it will
-    # become a job on controller which messes up the job IDs (we assume the
-    # job ID in controller's job queue is consistent with managed job IDs).
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Starting dashboard...')):
-        runner = handle.get_command_runners()[0]
-        runner.run(
-            f'export '
-            f'{skylet_constants.USER_ID_ENV_VAR}={common.SERVER_ID!r}; '
-            f'{managed_job_constants.DASHBOARD_SETUP_CMD}',
-            stream_logs=True,
-        )
+
     controller_status = status_lib.ClusterStatus.UP
     rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
 
@@ -598,55 +580,6 @@ def tail_logs(name: Optional[str],
                       tail=tail)
 
 
-def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:
-    """Opens a dashboard for managed jobs (needs controller to be UP)."""
-    # TODO(SKY-1212): ideally, the controller/dashboard server should expose the
-    # API perhaps via REST. Then here we would (1) not have to use SSH to try to
-    # see if the controller is UP first, which is slow; (2) not have to run SSH
-    # port forwarding first (we'd just launch a local dashboard which would make
-    # REST API calls to the controller dashboard server).
-    logger.info('Starting dashboard')
-    hint = ('Dashboard is not available if jobs controller is not up. Run '
-            'a managed job first or run: sky jobs queue --refresh')
-    handle = _maybe_restart_controller(
-        refresh=refresh,
-        stopped_message=hint,
-        spinner_message='Checking jobs controller')
-
-    # SSH forward a free local port to remote's dashboard port.
-    remote_port = skylet_constants.SPOT_DASHBOARD_REMOTE_PORT
-    free_port = common_utils.find_free_port(remote_port)
-    runner = handle.get_command_runners()[0]
-    port_forward_command = ' '.join(
-        runner.port_forward_command(port_forward=[(free_port, remote_port)],
-                                    connect_timeout=1))
-    port_forward_command = (
-        f'{port_forward_command} '
-        f'> ~/sky_logs/api_server/dashboard-{common_utils.get_user_hash()}.log '
-        '2>&1')
-    logger.info(f'Forwarding port: {colorama.Style.DIM}{port_forward_command}'
-                f'{colorama.Style.RESET_ALL}')
-
-    ssh_process = subprocess.Popen(port_forward_command,
-                                   shell=True,
-                                   start_new_session=True)
-    time.sleep(3)  # Added delay for ssh_command to initialize.
-    logger.info(f'{colorama.Fore.GREEN}Dashboard is now available at: '
-                f'http://127.0.0.1:{free_port}{colorama.Style.RESET_ALL}')
-
-    return free_port, ssh_process.pid
-
-
-def stop_dashboard_forwarding(pid: int) -> None:
-    # Exit the ssh command when the context manager is closed.
-    try:
-        os.killpg(os.getpgid(pid), signal.SIGTERM)
-    except ProcessLookupError:
-        # This happens if jobs controller is auto-stopped.
-        pass
-    logger.info('Forwarding port closed. Exiting.')
-
-
 @usage_lib.entrypoint
 def download_logs(
     name: Optional[str],
sky/jobs/state.py
CHANGED
@@ -161,8 +161,8 @@ def create_table(cursor, conn):
     conn.commit()
 
 
-# Module-level connection/cursor; thread-safe as the
-#
+# Module-level connection/cursor; thread-safe as the db is initialized once
+# across all threads.
 def _get_db_path() -> str:
     """Workaround to collapse multi-step Path ops for type checker.
     Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
@@ -173,8 +173,7 @@ def _get_db_path() -> str:
     return str(path)
 
 
-_DB_PATH =
-_db_initialized = False
+_DB_PATH = None
 _db_init_lock = threading.Lock()
 
 
@@ -183,13 +182,13 @@ def _init_db(func):
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-        global
-        if
+        global _DB_PATH
+        if _DB_PATH is not None:
            return func(*args, **kwargs)
        with _db_init_lock:
-            if
+            if _DB_PATH is None:
+                _DB_PATH = _get_db_path()
                db_utils.SQLiteConn(_DB_PATH, create_table)
-            _db_initialized = True
        return func(*args, **kwargs)
 
    return wrapper
@@ -442,7 +441,7 @@ class ManagedJobScheduleState(enum.Enum):
 # === Status transition functions ===
 @_init_db
 def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
             """\
@@ -456,7 +455,7 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
 @_init_db
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
     """Set the task to pending state."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
             """\
@@ -484,7 +483,7 @@ def set_starting(job_id: int, task_id: int, run_timestamp: str,
         specs: The specs of the managed task.
         callback_func: The callback function.
     """
-    assert
+    assert _DB_PATH is not None
     # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
     # the log directory and submission time align with each other, so as to
     # make it easier to find them based on one of the values.
@@ -524,7 +523,7 @@ def set_backoff_pending(job_id: int, task_id: int):
     This should only be used to transition from STARTING or RECOVERING back to
     PENDING.
     """
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
             """\
@@ -552,7 +551,7 @@ def set_restarting(job_id: int, task_id: int, recovering: bool):
     after using set_backoff_pending to transition back to PENDING during
     launch retry backoff.
     """
-    assert
+    assert _DB_PATH is not None
     target_status = ManagedJobStatus.STARTING.value
     if recovering:
         target_status = ManagedJobStatus.RECOVERING.value
@@ -578,7 +577,7 @@ def set_restarting(job_id: int, task_id: int, recovering: bool):
 def set_started(job_id: int, task_id: int, start_time: float,
                 callback_func: CallbackType):
     """Set the task to started state."""
-    assert
+    assert _DB_PATH is not None
     logger.info('Job started.')
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
@@ -610,7 +609,7 @@ def set_started(job_id: int, task_id: int, start_time: float,
 @_init_db
 def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
     """Set the task to recovering state, and update the job duration."""
-    assert
+    assert _DB_PATH is not None
     logger.info('=== Recovering... ===')
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
@@ -634,7 +633,7 @@ def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
 def set_recovered(job_id: int, task_id: int, recovered_time: float,
                   callback_func: CallbackType):
     """Set the task to recovered."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
             """\
@@ -658,7 +657,7 @@ def set_recovered(job_id: int, task_id: int, recovered_time: float,
 def set_succeeded(job_id: int, task_id: int, end_time: float,
                   callback_func: CallbackType):
     """Set the task to succeeded, if it is in a non-terminal state."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
             """\
@@ -703,7 +702,7 @@ def set_failed(
         override_terminal: If True, override the current status even if end_at
             is already set.
     """
-    assert
+    assert _DB_PATH is not None
     assert failure_type.is_failed(), failure_type
     end_time = time.time() if end_time is None else end_time
 
@@ -761,7 +760,7 @@ def set_cancelling(job_id: int, callback_func: CallbackType):
     task_id is not needed, because we expect the job should be cancelled
     as a whole, and we should not cancel a single task.
     """
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         rows = cursor.execute(
             """\
@@ -783,7 +782,7 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
 
     The set_cancelling should be called before this function.
     """
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         rows = cursor.execute(
             """\
@@ -804,7 +803,7 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
 def set_local_log_file(job_id: int, task_id: Optional[int],
                        local_log_file: str):
     """Set the local log file for a job."""
-    assert
+    assert _DB_PATH is not None
     filter_str = 'spot_job_id=(?)'
     filter_args = [local_log_file, job_id]
 
@@ -822,7 +821,7 @@ def set_local_log_file(job_id: int, task_id: Optional[int],
 def get_nonterminal_job_ids_by_name(name: Optional[str],
                                     all_users: bool = False) -> List[int]:
     """Get non-terminal job ids by name."""
-    assert
+    assert _DB_PATH is not None
     statuses = ', '.join(['?'] * len(ManagedJobStatus.terminal_statuses()))
     field_values = [
         status.value for status in ManagedJobStatus.terminal_statuses()
@@ -866,7 +865,7 @@ def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
     exception: the job may have just transitioned from WAITING to LAUNCHING, but
     the controller process has not yet started.
     """
-    assert
+    assert _DB_PATH is not None
     job_filter = '' if job_id is None else 'AND spot_job_id=(?)'
     job_value = (job_id,) if job_id is not None else ()
 
@@ -909,7 +908,7 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
     - Jobs have schedule_state DONE but are in a non-terminal status
     - Legacy jobs (that is, no schedule state) that are in non-terminal status
     """
-    assert
+    assert _DB_PATH is not None
     job_filter = '' if job_id is None else 'AND spot.spot_job_id=(?)'
     job_value = () if job_id is None else (job_id,)
 
@@ -958,7 +957,7 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
 @_init_db
 def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get all job ids by name."""
-    assert
+    assert _DB_PATH is not None
     name_filter = ''
     field_values = []
     if name is not None:
@@ -987,7 +986,7 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
 @_init_db
 def _get_all_task_ids_statuses(
         job_id: int) -> List[Tuple[int, ManagedJobStatus]]:
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         id_statuses = cursor.execute(
             """\
@@ -1035,7 +1034,7 @@ def get_failure_reason(job_id: int) -> Optional[str]:
 
     If the job has multiple tasks, we return the first failure reason.
     """
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         reason = cursor.execute(
             """\
@@ -1051,7 +1050,7 @@ def get_failure_reason(job_id: int) -> Optional[str]:
 @_init_db
 def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     """Get managed jobs from the database."""
-    assert
+    assert _DB_PATH is not None
     job_filter = '' if job_id is None else f'WHERE spot.spot_job_id={job_id}'
 
     # Join spot and job_info tables to get the job name for each task.
@@ -1097,7 +1096,7 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
 @_init_db
 def get_task_name(job_id: int, task_id: int) -> str:
     """Get the task name of a job."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         task_name = cursor.execute(
             """\
@@ -1110,7 +1109,7 @@ def get_task_name(job_id: int, task_id: int) -> str:
 @_init_db
 def get_latest_job_id() -> Optional[int]:
     """Get the latest job id."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         rows = cursor.execute("""\
             SELECT spot_job_id FROM spot
@@ -1123,7 +1122,7 @@ def get_latest_job_id() -> Optional[int]:
 
 @_init_db
 def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         task_specs = cursor.execute(
             """\
@@ -1136,7 +1135,7 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
 @_init_db
 def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
     """Get the local log directory for a job."""
-    assert
+    assert _DB_PATH is not None
     filter_str = 'spot_job_id=(?)'
     filter_args = [job_id]
     if task_id is not None:
@@ -1159,7 +1158,7 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
                           original_user_yaml_path: str, env_file_path: str,
                           user_hash: str, priority: int) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1177,7 +1176,7 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
 def scheduler_set_launching(job_id: int,
                             current_state: ManagedJobScheduleState) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1191,7 +1190,7 @@ def scheduler_set_launching(job_id: int,
 @_init_db
 def scheduler_set_alive(job_id: int) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1205,7 +1204,7 @@ def scheduler_set_alive(job_id: int) -> None:
 @_init_db
 def scheduler_set_alive_backoff(job_id: int) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1219,7 +1218,7 @@ def scheduler_set_alive_backoff(job_id: int) -> None:
 @_init_db
 def scheduler_set_alive_waiting(job_id: int) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1234,7 +1233,7 @@ def scheduler_set_alive_waiting(job_id: int) -> None:
 @_init_db
 def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1248,7 +1247,7 @@ def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
 
 @_init_db
 def set_job_controller_pid(job_id: int, pid: int):
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
             'UPDATE job_info SET '
@@ -1259,7 +1258,7 @@ def set_job_controller_pid(job_id: int, pid: int):
 
 @_init_db
 def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         state = cursor.execute(
             'SELECT schedule_state FROM job_info WHERE spot_job_id = (?)',
@@ -1269,7 +1268,7 @@ def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
 
 @_init_db
 def get_num_launching_jobs() -> int:
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         return cursor.execute(
             'SELECT COUNT(*) '
@@ -1280,7 +1279,7 @@ def get_num_launching_jobs() -> int:
 
 @_init_db
 def get_num_alive_jobs() -> int:
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         return cursor.execute(
             'SELECT COUNT(*) '
@@ -1303,7 +1302,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
     Backwards compatibility note: jobs submitted before #4485 will have no
     schedule_state and will be ignored by this SQL query.
    """
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         # Get the highest-priority WAITING or ALIVE_WAITING job whose priority
         # is greater than or equal to the highest priority LAUNCHING or
@@ -1338,7 +1337,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
 @_init_db
 def get_workspace(job_id: int) -> str:
     """Get the workspace of a job."""
-    assert
+    assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         workspace = cursor.execute(
             'SELECT workspace FROM job_info WHERE spot_job_id = (?)',
sky/provision/common.py
CHANGED
sky/provision/gcp/config.py
CHANGED
@@ -274,7 +274,7 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
     # For example, `roles/iam.serviceAccountUser` can be granted at the
     # skypilot-v1 service account level, which can be checked with
     # service_account_policy = iam.projects().serviceAccounts().getIamPolicy(
-    #     resource=f'projects/{project_id}/
+    #     resource=f'projects/{project_id}/serviceAccounts/{email}').execute()
     # We now skip the check for `iam.serviceAccounts.actAs` permission for
     # simplicity as it can be granted at the service account level.
     def check_permissions(policy, required_permissions):
sky/provision/kubernetes/instance.py
CHANGED
@@ -1277,7 +1277,8 @@ def query_instances(
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
             if is_ssh:
-                node_pool =
+                node_pool = common_utils.removeprefix(context,
+                                                      'ssh-') if context else ''
                 msg = (
                     f'Cannot connect to SSH Node Pool {node_pool}. '
                     'Please check if the SSH Node Pool is up and accessible. '
sky/provision/kubernetes/utils.py
CHANGED
@@ -133,6 +133,30 @@ DEFAULT_MAX_RETRIES = 3
 DEFAULT_RETRY_INTERVAL_SECONDS = 1
 
 
+def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
+    """Normalize TPU names to the k8s-compatible name and extract count."""
+    # Examples:
+    # 'tpu-v6e-8' -> ('tpu-v6e-slice', 8)
+    # 'tpu-v5litepod-4' -> ('tpu-v5-lite-podslice', 4)
+
+    gcp_to_k8s_patterns = [
+        (r'^tpu-v6e-(\d+)$', 'tpu-v6e-slice'),
+        (r'^tpu-v5p-(\d+)$', 'tpu-v5p-slice'),
+        (r'^tpu-v5litepod-(\d+)$', 'tpu-v5-lite-podslice'),
+        (r'^tpu-v5lite-(\d+)$', 'tpu-v5-lite-device'),
+        (r'^tpu-v4-(\d+)$', 'tpu-v4-podslice'),
+    ]
+
+    for pattern, replacement in gcp_to_k8s_patterns:
+        match = re.match(pattern, accelerator)
+        if match:
+            count = int(match.group(1))
+            return replacement, count
+
+    # Default fallback
+    return accelerator, 1
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
@@ -427,6 +451,7 @@ class GKELabelFormatter(GPULabelFormatter):
 
         e.g. tpu-v5-lite-podslice:8 -> '2x4'
         """
+        acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
         count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
                                                        {}).get(acc_count, None)
         if count_to_topology is None:
@@ -461,6 +486,14 @@ class GKELabelFormatter(GPULabelFormatter):
                 raise ValueError(
                     f'Invalid accelerator name in GKE cluster: {value}')
 
+    @classmethod
+    def validate_label_value(cls, value: str) -> Tuple[bool, str]:
+        try:
+            _ = cls.get_accelerator_from_label_value(value)
+            return True, ''
+        except ValueError as e:
+            return False, str(e)
+
 
 class GFDLabelFormatter(GPULabelFormatter):
     """GPU Feature Discovery label formatter
@@ -565,17 +598,29 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
-    label_formatter = None
-
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
+        skip = False
         for _, label_list in node_labels.items():
-            for label,
+            for label, value in label_list:
                 if lf.match_label_key(label):
-
-
+                    valid, reason = lf.validate_label_value(value)
+                    if valid:
+                        return lf(), node_labels
+                    else:
+                        logger.warning(f'GPU label {label} matched for label '
+                                       f'formatter {lf.__class__.__name__}, '
+                                       f'but has invalid value {value}. '
+                                       f'Reason: {reason}. '
+                                       'Skipping...')
+                        skip = True
+                        break
+            if skip:
+                break
+        if skip:
+            continue
 
-    return
+    return None, node_labels
 
 
 class Autoscaler:
@@ -754,6 +799,8 @@ class GKEAutoscaler(Autoscaler):
                 f'checking {node_pool_name} for TPU {requested_acc_type}:'
                 f'{requested_acc_count}')
             if 'resourceLabels' in node_config:
+                requested_acc_type, requested_acc_count = normalize_tpu_accelerator_name(
+                    requested_acc_type)
                 accelerator_exists = cls._node_pool_has_tpu_capacity(
                     node_config['resourceLabels'], machine_type,
                     requested_acc_type, requested_acc_count)
@@ -993,7 +1040,7 @@ def check_instance_fits(context: Optional[str],
             'Maximum resources found on a single node: '
             f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
 
-    def check_tpu_fits(
+    def check_tpu_fits(acc_type: str, acc_count: int,
                        node_list: List[Any]) -> Tuple[bool, Optional[str]]:
         """Checks if the instance fits on the cluster based on requested TPU.
 
@@ -1003,8 +1050,6 @@ def check_instance_fits(context: Optional[str],
         node (node_tpu_chip_count) and the total TPU chips across the entire
         podslice (topology_chip_count) are correctly handled.
         """
-        acc_type = candidate_instance_type.accelerator_type
-        acc_count = candidate_instance_type.accelerator_count
         tpu_list_in_cluster = []
         for node in node_list:
             if acc_type == node.metadata.labels[
@@ -1055,7 +1100,8 @@ def check_instance_fits(context: Optional[str],
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
        # has sufficient TPU resource to meet the requirement.
-
+        acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
+        fits, reason = check_tpu_fits(acc_type, acc_count, gpu_nodes)
         if reason is not None:
             return fits, reason
     else:
@@ -1141,8 +1187,8 @@ def get_accelerator_label_key_values(
 
     is_ssh_node_pool = context.startswith('ssh-') if context else False
     cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
-    context_display_name =
-    context and is_ssh_node_pool) else context
+    context_display_name = common_utils.removeprefix(
+        context, 'ssh-') if (context and is_ssh_node_pool) else context
 
     autoscaler_type = get_autoscaler_type()
     if autoscaler_type is not None:
@@ -2911,7 +2957,8 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
 def is_tpu_on_gke(accelerator: str) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
-
+    normalized, _ = normalize_tpu_accelerator_name(accelerator)
+    return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
 
 
 def get_node_accelerator_count(attribute_dict: dict) -> int:
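The new normalize_tpu_accelerator_name helper maps GCP-style TPU accelerator names onto the GKE node-label form and splits off the chip count, so the rest of the Kubernetes utilities (topology lookup, autoscaler capacity checks, is_tpu_on_gke) can keep working with label-style names. Below is a standalone copy of that logic with a few usage checks, for illustration; it mirrors the regex table shown in the diff rather than importing from SkyPilot.

# Standalone copy of the normalization logic from the diff, for illustration.
import re
from typing import Tuple

_GCP_TO_K8S = [
    (r'^tpu-v6e-(\d+)$', 'tpu-v6e-slice'),
    (r'^tpu-v5p-(\d+)$', 'tpu-v5p-slice'),
    (r'^tpu-v5litepod-(\d+)$', 'tpu-v5-lite-podslice'),
    (r'^tpu-v5lite-(\d+)$', 'tpu-v5-lite-device'),
    (r'^tpu-v4-(\d+)$', 'tpu-v4-podslice'),
]

def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
    for pattern, replacement in _GCP_TO_K8S:
        match = re.match(pattern, accelerator)
        if match:
            return replacement, int(match.group(1))
    return accelerator, 1  # Unrecognized names pass through with count 1.

assert normalize_tpu_accelerator_name('tpu-v6e-8') == ('tpu-v6e-slice', 8)
assert normalize_tpu_accelerator_name('tpu-v5litepod-4') == ('tpu-v5-lite-podslice', 4)
assert normalize_tpu_accelerator_name('H100') == ('H100', 1)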
|