skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +125 -22
- sky/backends/cloud_vm_ray_backend.py +224 -72
- sky/catalog/__init__.py +7 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +18 -0
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +2 -71
- sky/client/sdk_async.py +5 -2
- sky/clouds/aws.py +23 -5
- sky/clouds/cloud.py +8 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +34 -0
- sky/jobs/client/sdk_async.py +4 -2
- sky/jobs/controller.py +4 -2
- sky/jobs/recovery_strategy.py +1 -1
- sky/jobs/state.py +26 -16
- sky/jobs/utils.py +6 -11
- sky/logs/agent.py +10 -2
- sky/provision/kubernetes/config.py +7 -2
- sky/provision/kubernetes/instance.py +84 -41
- sky/provision/vast/instance.py +1 -1
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/server/config.py +14 -5
- sky/server/metrics.py +41 -8
- sky/server/requests/executor.py +41 -4
- sky/server/server.py +1 -0
- sky/server/uvicorn.py +11 -5
- sky/skylet/constants.py +12 -7
- sky/skylet/log_lib.py +11 -0
- sky/skylet/log_lib.pyi +9 -0
- sky/task.py +62 -0
- sky/templates/kubernetes-ray.yml.j2 +120 -3
- sky/utils/accelerator_registry.py +3 -1
- sky/utils/command_runner.py +35 -11
- sky/utils/command_runner.pyi +22 -0
- sky/utils/context_utils.py +15 -2
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/git.py +559 -1
- sky/utils/resource_checker.py +8 -7
- sky/workspaces/core.py +57 -21
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces/[name].html
CHANGED

@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e8a0c4c3c6f408fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-0487dfbf149d9e53.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-86cabed5d4669ad0.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"DAiq7V2xJnO1LSfmunZl6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces.html
CHANGED

@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e8a0c4c3c6f408fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"DAiq7V2xJnO1LSfmunZl6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/global_user_state.py
CHANGED

@@ -118,6 +118,9 @@ cluster_table = sqlalchemy.Table(
     sqlalchemy.Column('provision_log_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('skylet_ssh_tunnel_metadata',
+                      sqlalchemy.LargeBinary,
+                      server_default=None),
 )

 storage_table = sqlalchemy.Table(
@@ -1170,6 +1173,37 @@ def set_cluster_storage_mounts_metadata(
         raise ValueError(f'Cluster {cluster_name} not found.')


+@_init_db
+@metrics_lib.time_me
+def get_cluster_skylet_ssh_tunnel_metadata(
+        cluster_name: str) -> Optional[Tuple[int, int]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None or row.skylet_ssh_tunnel_metadata is None:
+            return None
+        return pickle.loads(row.skylet_ssh_tunnel_metadata)
+
+
+@_init_db
+@metrics_lib.time_me
+def set_cluster_skylet_ssh_tunnel_metadata(
+        cluster_name: str,
+        skylet_ssh_tunnel_metadata: Optional[Tuple[int, int]]) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        value = pickle.dumps(
+            skylet_ssh_tunnel_metadata
+        ) if skylet_ssh_tunnel_metadata is not None else None
+        count = session.query(cluster_table).filter_by(
+            name=cluster_name).update(
+                {cluster_table.c.skylet_ssh_tunnel_metadata: value})
+        session.commit()
+        assert count <= 1, count
+        if count == 0:
+            raise ValueError(f'Cluster {cluster_name} not found.')
+
+
 @_init_db
 @metrics_lib.time_me
 def _get_cluster_usage_intervals(

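For reference, the two new helpers round-trip a pickled tuple through the new skylet_ssh_tunnel_metadata column. A minimal usage sketch (illustrative only; it assumes the SkyPilot state DB is initialized, that a cluster row named 'my-cluster' exists, and that the two integers are a local port and a tunnel process id, which this diff does not spell out):

    from typing import Optional, Tuple

    from sky import global_user_state

    # Hypothetical values: the diff only shows Tuple[int, int]; the exact
    # meaning of the two integers is an assumption here.
    global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
        'my-cluster', (10022, 4242))

    metadata: Optional[Tuple[int, int]] = (
        global_user_state.get_cluster_skylet_ssh_tunnel_metadata('my-cluster'))
    print(metadata)  # (10022, 4242)

    # Passing None clears the column (stores NULL instead of a pickled value).
    global_user_state.set_cluster_skylet_ssh_tunnel_metadata('my-cluster', None)
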
sky/jobs/client/sdk_async.py
CHANGED

@@ -28,6 +28,8 @@ logger = sky_logging.init_logger(__name__)
 async def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     # Internal only:
     # pylint: disable=invalid-name
     _need_confirmation: bool = False,
@@ -35,8 +37,8 @@ async def launch(
         sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
 ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
     """Async version of launch() that launches a managed job."""
-    request_id = await context_utils.to_thread(sdk.launch, task, name,
-                                               _need_confirmation)
+    request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
+                                               num_jobs, _need_confirmation)
     if stream_logs is not None:
         return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
     else:

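The async launch() wrapper now forwards two new keyword arguments, pool and num_jobs, to the synchronous sdk.launch(). A minimal calling sketch (illustrative; the task definition and the pool name 'my-pool' are assumptions, and a running API server plus an existing pool are required):

    import asyncio

    import sky
    from sky.jobs.client import sdk_async


    async def main():
        task = sky.Task(run='echo hello')
        job_id, handle = await sdk_async.launch(task,
                                                name='demo-job',
                                                pool='my-pool',
                                                num_jobs=2)
        print(job_id, handle)


    asyncio.run(main())
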
sky/jobs/controller.py
CHANGED

@@ -781,7 +781,7 @@ class JobsController:
 class Controller:
     """Controller for managing jobs."""

-    def __init__(self):
+    def __init__(self) -> None:
         # Global state for active jobs
         self.job_tasks: Dict[int, asyncio.Task] = {}
         self.starting: Set[int] = set()
@@ -984,12 +984,14 @@ class Controller:
                 job_logger.info(
                     f'Cluster of managed job {job_id} has been cleaned up.')
             except Exception as e:  # pylint: disable=broad-except
+                failure_reason = ('Failed to clean up: '
+                                  f'{common_utils.format_exception(e)}')
                 await managed_job_state.set_failed_async(
                     job_id,
                     task_id=None,
                     failure_type=managed_job_state.ManagedJobStatus.
                     FAILED_CONTROLLER,
-                    failure_reason=
+                    failure_reason=failure_reason,
                     override_terminal=True)

             if cancelling:

sky/jobs/recovery_strategy.py
CHANGED

@@ -543,7 +543,7 @@ class StrategyExecutor:

             except exceptions.NoClusterLaunchedError:
                 # Update the status to PENDING during backoff.
-                state.set_backoff_pending_async(self.job_id, self.task_id)
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)

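The fix above adds a missing await: set_backoff_pending_async is a coroutine function, so calling it without await only creates a coroutine object and the status update never runs. A standalone illustration of the difference (not SkyPilot code):

    import asyncio


    async def set_backoff_pending_async(job_id: int, task_id: int) -> None:
        print(f'job {job_id}, task {task_id} -> PENDING')


    async def buggy() -> None:
        # Creates a coroutine that is never scheduled: Python emits
        # "RuntimeWarning: coroutine ... was never awaited" and nothing prints.
        set_backoff_pending_async(1, 0)


    async def fixed() -> None:
        # Awaiting the call actually runs the status update.
        await set_backoff_pending_async(1, 0)


    asyncio.run(buggy())
    asyncio.run(fixed())
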
sky/jobs/state.py
CHANGED

@@ -238,6 +238,7 @@ def _init_db_async(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 await asyncio.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc

     return wrapper
@@ -266,6 +267,7 @@ def _init_db(func):
                 last_exc = e
                 logger.debug(f'DB error: {last_exc}')
                 time.sleep(backoff.current_backoff())
+        assert last_exc is not None
         raise last_exc

     return wrapper
@@ -735,16 +737,21 @@ def set_pending_cancelled(job_id: int):
         # Subquery to get the spot_job_ids that match the joined condition
         subquery = session.query(spot_table.c.job_id).join(
             job_info_table,
-            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
-
-
-
-
-
-
-
-
-
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+        ).filter(
+            spot_table.c.spot_job_id == job_id,
+            spot_table.c.status == ManagedJobStatus.PENDING.value,
+            # Note: it's possible that a WAITING job actually needs to be
+            # cleaned up, if we are in the middle of an upgrade/recovery and
+            # the job is waiting to be reclaimed by a new controller. But,
+            # in this case the status will not be PENDING.
+            sqlalchemy.or_(
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.schedule_state ==
+                ManagedJobScheduleState.INACTIVE.value,
+            ),
+        ).subquery()

         count = session.query(spot_table).filter(
             spot_table.c.job_id.in_(subquery)).update(
@@ -1105,8 +1112,11 @@ async def set_job_id_on_pool_cluster_async(job_id: int,
     """Set the job id on the pool cluster for a job."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-        await session.execute(
-
+        await session.execute(
+            sqlalchemy.update(job_info_table).
+            where(job_info_table.c.spot_job_id == job_id).values({
+                job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+            }))
         await session.commit()


@@ -1130,12 +1140,12 @@ async def get_pool_submit_info_async(
         job_id: int) -> Tuple[Optional[str], Optional[int]]:
     """Get the cluster name and job id on the pool from the managed job id."""
     assert _SQLALCHEMY_ENGINE_ASYNC is not None
-    async with
-
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
             sqlalchemy.select(job_info_table.c.current_cluster_name,
                               job_info_table.c.job_id_on_pool_cluster).where(
-                                  job_info_table.c.spot_job_id == job_id)
-
+                                  job_info_table.c.spot_job_id == job_id))
+        info = result.fetchone()
         if info is None:
             return None, None
         return info[0], info[1]

sky/jobs/utils.py
CHANGED

@@ -586,7 +586,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
         raise


-def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+def event_callback_func(
+        job_id: int, task_id: Optional[int],
+        task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
     """Run event callback for the task."""

     def callback_func(status: str):
@@ -625,17 +627,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
             f'Bash:{event_callback},log_path:{log_path},result:{result}')
         logger.info(f'=== END: event callback for {status!r} ===')

-
-
-
-        # In async context
-        async def async_callback_func(status: str):
-            return await context_utils.to_thread(callback_func, status)
+    async def async_callback_func(status: str):
+        return await context_utils.to_thread(callback_func, status)

-
-    except RuntimeError:
-        # Not in async context
-        return callback_func
+    return async_callback_func


 # ======== user functions ========

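After this change, event_callback_func always returns an async wrapper that pushes the blocking callback to a worker thread, instead of deciding at call time whether it is running in an async context. A standalone sketch of the wrapping pattern (illustrative; asyncio.to_thread stands in for SkyPilot's context_utils.to_thread):

    import asyncio
    import time


    def callback_func(status: str) -> None:
        # Stands in for the synchronous callback body, which may block
        # (e.g. running a user-provided bash hook).
        time.sleep(0.1)
        print(f'callback ran for status {status!r}')


    async def async_callback_func(status: str):
        # The blocking callback runs on a worker thread and is awaited,
        # keeping the controller's event loop responsive.
        return await asyncio.to_thread(callback_func, status)


    asyncio.run(async_callback_func('SUCCEEDED'))
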
sky/logs/agent.py
CHANGED

@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                           cluster_name: resources_utils.ClusterName) -> str:
         install_cmd = (
             'if ! command -v fluent-bit >/dev/null 2>&1; then '
-            'sudo apt-get install -y gnupg; '
+            'sudo apt-get update; sudo apt-get install -y gnupg; '
             # pylint: disable=line-too-long
-            'curl https://
+            'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+            # pylint: disable=line-too-long
+            'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+            # pylint: disable=line-too-long
+            'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+            # pylint: disable=line-too-long
+            'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+            'sudo apt-get update; '
+            'sudo apt-get install -y fluent-bit; '
             'fi')
         cfg = self.fluentbit_config(cluster_name)
         cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')

sky/provision/kubernetes/config.py
CHANGED

@@ -3,7 +3,7 @@ import copy
 import logging
 import math
 import os
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],


 class KubernetesError(Exception):
-
+
+    def __init__(self,
+                 *args,
+                 insufficent_resources: Optional[List[str]] = None):
+        self.insufficent_resources = insufficent_resources
+        super().__init__(*args)

sky/provision/kubernetes/instance.py
CHANGED

@@ -3,6 +3,7 @@ import copy
 import datetime
 import json
 import re
+import sys
 import time
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                 break
         if event_message is not None:
             if pod_status == 'Pending':
-
+                out_of = {}
+                # key: resource name, value: (extra message, nice name)
                 if 'Insufficient cpu' in event_message:
-
-
+                    out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'CPU:.status.allocatable.cpu\' to check '
+                                     'the available CPUs on the node.', 'CPUs')
                 if 'Insufficient memory' in event_message:
-
-
-
+                    out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                        'custom-columns=NAME:.metadata.name,'
+                                        'MEMORY:.status.allocatable.memory\' '
+                                        'to check the available memory on the '
+                                        'node.', 'Memory')
+
                 # TODO(aylei): after switching from smarter-device-manager to
                 # fusermount-server, we need a new way to check whether the
                 # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                     for key in lf.get_label_keys()
                 ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                for label_key in gpu_lf_keys:
+                    # TODO(romilb): We may have additional node
+                    # affinity selectors in the future - in that
+                    # case we will need to update this logic.
+                    # TODO(Doyoung): Update the error message raised
+                    # with the multi-host TPU support.
+                    gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                        context)  # pylint: disable=line-too-long
+                    if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                        ('didn\'t match Pod\'s node affinity/selector'
+                         in event_message) and pod.spec.node_selector):
+                        if 'gpu' in gpu_resource_key.lower():
+                            info_msg = (
+                                ': Run \'sky show-gpus --infra kubernetes\' to '
+                                'see the available GPUs.')
+                        else:
+                            info_msg = ': '
+                        if (pod.spec.node_selector and
+                                label_key in pod.spec.node_selector):
+                            extra_msg = (
+                                f'Verify if any node matching label '
+                                f'{pod.spec.node_selector[label_key]} and '
+                                f'sufficient resource {gpu_resource_key} '
+                                f'is available in the cluster.')
+                            extra_msg = info_msg + ' ' + extra_msg
+                        else:
+                            extra_msg = info_msg
+                        if gpu_resource_key not in out_of or len(
+                                out_of[gpu_resource_key][0]) < len(extra_msg):
+                            out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+                if len(out_of) > 0:
+                    # We are out of some resources. We should raise an error.
+                    rsrc_err_msg = 'Insufficient resource capacity on the '
+                    rsrc_err_msg += 'cluster:\n'
+                    out_of_keys = list(out_of.keys())
+                    for i in range(len(out_of_keys)):
+                        rsrc = out_of_keys[i]
+                        (extra_msg, nice_name) = out_of[rsrc]
+                        extra_msg = extra_msg if extra_msg else ''
+                        if i == len(out_of_keys) - 1:
+                            indent = '└──'
+                        else:
+                            indent = '├──'
+                        rsrc_err_msg += (f'{indent} Cluster does not have '
+                                         f'sufficient {nice_name} for your request'
+                                         f'{extra_msg}')
+                        if i != len(out_of_keys) - 1:
+                            rsrc_err_msg += '\n'
+
+                    # Emit the error message without logging prefixes for better UX.
+                    tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                    tmp_handler.flush = sys.stdout.flush
+                    tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                    tmp_handler.setLevel(sky_logging.ERROR)
+                    prev_propagate = logger.propagate
+                    try:
+                        logger.addHandler(tmp_handler)
+                        logger.propagate = False
+                        logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                    finally:
+                        logger.removeHandler(tmp_handler)
+                        logger.propagate = prev_propagate
+                    nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                    raise config_lib.KubernetesError(
+                        f'{timeout_err_msg} '
+                        f'Pod status: {pod_status} '
+                        f'Details: \'{event_message}\' ',
+                        insufficent_resources=nice_names,
+                    )
+
             raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                              f'Pod status: {pod_status} '
                                              f'Details: \'{event_message}\' ')

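KubernetesError now carries an optional insufficent_resources list (spelled that way in this release), so callers can tell which resources were short without parsing the message. A minimal handling sketch (illustrative; the raise below is a placeholder for the pod-scheduling check shown above):

    from sky.provision.kubernetes import config as config_lib

    try:
        # Placeholder for a provisioning call that hits the scheduling check.
        raise config_lib.KubernetesError(
            'Timed out while waiting for pods to schedule. '
            'Pod status: Pending',
            insufficent_resources=['CPUs', 'Memory'])
    except config_lib.KubernetesError as e:
        if e.insufficent_resources:
            print('Cluster is short on:', ', '.join(e.insufficent_resources))
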
sky/provision/vast/instance.py
CHANGED

@@ -39,7 +39,7 @@ def _filter_instances(cluster_name_on_cloud: str,

 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None

sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py
ADDED

@@ -0,0 +1,34 @@
+"""Add skylet_ssh_tunnel_metadata to clusters.
+
+Revision ID: 008
+Revises: 007
+Create Date: 2025-09-09
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '008'
+down_revision: Union[str, Sequence[str], None] = '007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add skylet_ssh_tunnel_metadata column to clusters."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'skylet_ssh_tunnel_metadata',
+                                             sa.LargeBinary(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass

sky/server/config.py
CHANGED

@@ -6,6 +6,7 @@ from typing import Optional

 from sky import sky_logging
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.utils import common_utils

 # Constants based on profiling the peak memory usage while serving various
@@ -21,7 +22,7 @@ from sky.utils import common_utils
 # in the future.
 # TODO(luca): The future is now! ^^^
 LONG_WORKER_MEM_GB = 0.4
-SHORT_WORKER_MEM_GB = 0.
+SHORT_WORKER_MEM_GB = 0.3
 # To control the number of long workers.
 _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
 # Limit the number of long workers of local API server, since local server is
@@ -36,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
 _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
 # Minimal number of long workers to ensure responsiveness.
 _MIN_LONG_WORKERS = 1
-# Minimal number of short workers
-
-_MIN_SHORT_WORKERS = 2
+# Minimal number of idle short workers to ensure responsiveness.
+_MIN_IDLE_SHORT_WORKERS = 1

 # Default number of burstable workers for local API server. A heuristic number
 # that is large enough for most local cases.
@@ -216,6 +216,15 @@ def _max_long_worker_parallism(cpu_count: int,
     return n


+def _get_min_short_workers() -> int:
+    """Min number of short workers."""
+    daemon_count = 0
+    for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
+        if not daemon.should_skip():
+            daemon_count += 1
+    return _MIN_IDLE_SHORT_WORKERS + daemon_count
+
+
 def _max_short_worker_parallism(mem_size_gb: float,
                                 long_worker_parallism: int) -> int:
     """Max parallelism for short workers."""
@@ -227,5 +236,5 @@ def _max_short_worker_parallism(mem_size_gb: float,
                                  server_constants.MIN_AVAIL_MEM_GB)
     reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
     available_mem = max(0, mem_size_gb - reserved_mem)
-    n = max(
+    n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
     return n

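To see how the new floor interacts with the memory-based bound in _max_short_worker_parallism, a small worked example (illustrative numbers only; the real max_memory and the number of enabled internal daemons depend on the deployment):

    # Assumed inputs for this example.
    SHORT_WORKER_MEM_GB = 0.3
    LONG_WORKER_MEM_GB = 0.4
    _MIN_IDLE_SHORT_WORKERS = 1

    mem_size_gb = 8.0
    long_worker_parallism = 4
    max_memory = 2.7   # assumed blocking-memory cap
    daemon_count = 3   # assumed number of enabled internal request daemons

    reserved_mem = max_memory + long_worker_parallism * LONG_WORKER_MEM_GB  # 4.3
    available_mem = max(0, mem_size_gb - reserved_mem)                      # 3.7
    min_short_workers = _MIN_IDLE_SHORT_WORKERS + daemon_count              # 4
    n = max(min_short_workers, int(available_mem / SHORT_WORKER_MEM_GB))    # 12
    print(n)
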
sky/server/metrics.py
CHANGED

@@ -4,6 +4,7 @@ import contextlib
 import functools
 import multiprocessing
 import os
+import threading
 import time

 import fastapi
@@ -21,6 +22,24 @@ from sky.skylet import constants
 METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                  'false').lower() == 'true'

+_KB = 2**10
+_MB = 2**20
+_MEM_BUCKETS = [
+    _KB,
+    256 * _KB,
+    512 * _KB,
+    _MB,
+    2 * _MB,
+    4 * _MB,
+    8 * _MB,
+    16 * _MB,
+    32 * _MB,
+    64 * _MB,
+    128 * _MB,
+    256 * _MB,
+    float('inf'),
+]
+
 logger = sky_logging.init_logger(__name__)

 # Total number of API server requests, grouped by path, method, and status.
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
     ['pid', 'type', 'mode'],
 )

+SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+    'sky_apiserver_request_memory_usage_bytes',
+    'Peak memory usage of requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
+SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+    'sky_apiserver_request_rss_incr_bytes',
+    'RSS increment after requests', ['name'],
+    buckets=_MEM_BUCKETS)
+
 metrics_app = fastapi.FastAPI()


@@ -208,19 +237,23 @@ def time_me_async(func):
     return async_wrapper


-
+peak_rss_bytes = 0
+
+
+def process_monitor(process_type: str, stop: threading.Event):
     pid = multiprocessing.current_process().pid
     proc = psutil.Process(pid)
-    peak_rss = 0
     last_bucket_end = time.time()
-
+    bucket_peak = 0
+    global peak_rss_bytes
+    while not stop.is_set():
         if time.time() - last_bucket_end >= 30:
-            # Reset peak RSS
+            # Reset peak RSS for the next time bucket.
             last_bucket_end = time.time()
-
-
-            SKY_APISERVER_PROCESS_PEAK_RSS.labels(
-
+            bucket_peak = 0
+        peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
+        SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+            pid=pid, type=process_type).set(peak_rss_bytes)
         ctimes = proc.cpu_times()
         SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                type=process_type,