skypilot-nightly 1.0.0.dev20250910__py3-none-any.whl → 1.0.0.dev20250912__py3-none-any.whl

This diff reflects the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (68)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +125 -22
  3. sky/backends/cloud_vm_ray_backend.py +224 -72
  4. sky/catalog/__init__.py +7 -0
  5. sky/catalog/aws_catalog.py +4 -0
  6. sky/catalog/common.py +18 -0
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +2 -71
  9. sky/client/sdk_async.py +5 -2
  10. sky/clouds/aws.py +23 -5
  11. sky/clouds/cloud.py +8 -0
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/{webpack-1d7e11230da3ca89.js → webpack-e8a0c4c3c6f408fb.js} +1 -1
  15. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  16. sky/dashboard/out/clusters/[cluster].html +1 -1
  17. sky/dashboard/out/clusters.html +1 -1
  18. sky/dashboard/out/config.html +1 -1
  19. sky/dashboard/out/index.html +1 -1
  20. sky/dashboard/out/infra/[context].html +1 -1
  21. sky/dashboard/out/infra.html +1 -1
  22. sky/dashboard/out/jobs/[job].html +1 -1
  23. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/global_user_state.py +34 -0
  31. sky/jobs/client/sdk_async.py +4 -2
  32. sky/jobs/controller.py +4 -2
  33. sky/jobs/recovery_strategy.py +1 -1
  34. sky/jobs/state.py +26 -16
  35. sky/jobs/utils.py +6 -11
  36. sky/logs/agent.py +10 -2
  37. sky/provision/kubernetes/config.py +7 -2
  38. sky/provision/kubernetes/instance.py +84 -41
  39. sky/provision/vast/instance.py +1 -1
  40. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  41. sky/server/config.py +14 -5
  42. sky/server/metrics.py +41 -8
  43. sky/server/requests/executor.py +41 -4
  44. sky/server/server.py +1 -0
  45. sky/server/uvicorn.py +11 -5
  46. sky/skylet/constants.py +12 -7
  47. sky/skylet/log_lib.py +11 -0
  48. sky/skylet/log_lib.pyi +9 -0
  49. sky/task.py +62 -0
  50. sky/templates/kubernetes-ray.yml.j2 +120 -3
  51. sky/utils/accelerator_registry.py +3 -1
  52. sky/utils/command_runner.py +35 -11
  53. sky/utils/command_runner.pyi +22 -0
  54. sky/utils/context_utils.py +15 -2
  55. sky/utils/db/migration_utils.py +1 -1
  56. sky/utils/git.py +559 -1
  57. sky/utils/resource_checker.py +8 -7
  58. sky/workspaces/core.py +57 -21
  59. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/METADATA +33 -33
  60. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/RECORD +66 -66
  61. sky/client/cli/git.py +0 -549
  62. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  63. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_buildManifest.js +0 -0
  64. /sky/dashboard/out/_next/static/{3SYxqNGnvvPS8h3gdD2T7 → DAiq7V2xJnO1LSfmunZl6}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/WHEEL +0 -0
  66. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/entry_points.txt +0 -0
  67. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/licenses/LICENSE +0 -0
  68. {skypilot_nightly-1.0.0.dev20250910.dist-info → skypilot_nightly-1.0.0.dev20250912.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1d7e11230da3ca89.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-0487dfbf149d9e53.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-86cabed5d4669ad0.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"3SYxqNGnvvPS8h3gdD2T7","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e8a0c4c3c6f408fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1836-37fede578e2da5f8.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-0487dfbf149d9e53.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-408ed10b2f9fce17.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-86cabed5d4669ad0.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-af76bb06dbb3954f.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"DAiq7V2xJnO1LSfmunZl6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1d7e11230da3ca89.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/3SYxqNGnvvPS8h3gdD2T7/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"3SYxqNGnvvPS8h3gdD2T7","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-e8a0c4c3c6f408fb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-7598c33a746cdc91.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/DAiq7V2xJnO1LSfmunZl6/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"DAiq7V2xJnO1LSfmunZl6","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -118,6 +118,9 @@ cluster_table = sqlalchemy.Table(
      sqlalchemy.Column('provision_log_path',
                        sqlalchemy.Text,
                        server_default=None),
+     sqlalchemy.Column('skylet_ssh_tunnel_metadata',
+                       sqlalchemy.LargeBinary,
+                       server_default=None),
  )
 
  storage_table = sqlalchemy.Table(
@@ -1170,6 +1173,37 @@ def set_cluster_storage_mounts_metadata(
          raise ValueError(f'Cluster {cluster_name} not found.')
 
 
+ @_init_db
+ @metrics_lib.time_me
+ def get_cluster_skylet_ssh_tunnel_metadata(
+         cluster_name: str) -> Optional[Tuple[int, int]]:
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         row = session.query(cluster_table).filter_by(name=cluster_name).first()
+         if row is None or row.skylet_ssh_tunnel_metadata is None:
+             return None
+         return pickle.loads(row.skylet_ssh_tunnel_metadata)
+
+
+ @_init_db
+ @metrics_lib.time_me
+ def set_cluster_skylet_ssh_tunnel_metadata(
+         cluster_name: str,
+         skylet_ssh_tunnel_metadata: Optional[Tuple[int, int]]) -> None:
+     assert _SQLALCHEMY_ENGINE is not None
+     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+         value = pickle.dumps(
+             skylet_ssh_tunnel_metadata
+         ) if skylet_ssh_tunnel_metadata is not None else None
+         count = session.query(cluster_table).filter_by(
+             name=cluster_name).update(
+                 {cluster_table.c.skylet_ssh_tunnel_metadata: value})
+         session.commit()
+         assert count <= 1, count
+         if count == 0:
+             raise ValueError(f'Cluster {cluster_name} not found.')
+
+
  @_init_db
  @metrics_lib.time_me
  def _get_cluster_usage_intervals(
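
For context, a minimal sketch (not part of the package) of how the two new helpers above could be exercised. Treating the (int, int) tuple as a (port, pid) pair for the skylet SSH tunnel is an assumption here; the diff does not spell out its meaning.

    from sky import global_user_state

    # Record tunnel metadata for a cluster; the helper pickles the tuple into
    # the new skylet_ssh_tunnel_metadata LargeBinary column.
    global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
        'my-cluster', (10022, 4321))  # hypothetical (port, pid)

    # Read it back; returns None when no metadata has been stored.
    meta = global_user_state.get_cluster_skylet_ssh_tunnel_metadata('my-cluster')
    print(meta)  # -> (10022, 4321)

    # Passing None stores SQL NULL rather than a pickled None; an unknown
    # cluster name raises ValueError.
    global_user_state.set_cluster_skylet_ssh_tunnel_metadata('my-cluster', None)
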
sky/jobs/client/sdk_async.py CHANGED
@@ -28,6 +28,8 @@ logger = sky_logging.init_logger(__name__)
  async def launch(
      task: Union['sky.Task', 'sky.Dag'],
      name: Optional[str] = None,
+     pool: Optional[str] = None,
+     num_jobs: Optional[int] = None,
      # Internal only:
      # pylint: disable=invalid-name
      _need_confirmation: bool = False,
@@ -35,8 +37,8 @@ async def launch(
          sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG,
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
      """Async version of launch() that launches a managed job."""
-     request_id = await context_utils.to_thread(sdk.launch, task, name,
-                                                _need_confirmation)
+     request_id = await context_utils.to_thread(sdk.launch, task, name, pool,
+                                                num_jobs, _need_confirmation)
      if stream_logs is not None:
          return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
      else:
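
A hedged usage sketch of the widened async signature; the pool name and the exact semantics of pool/num_jobs are assumptions here, mirroring the synchronous managed-jobs launch.

    import asyncio

    import sky
    from sky.jobs.client import sdk_async as jobs_sdk_async

    async def main():
        task = sky.Task(run='echo hello')
        # New in this release: pool and num_jobs are forwarded to sdk.launch.
        job_id, handle = await jobs_sdk_async.launch(
            task, name='demo', pool='my-pool', num_jobs=2)  # 'my-pool' is hypothetical
        print(job_id, handle)

    asyncio.run(main())
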
sky/jobs/controller.py CHANGED
@@ -781,7 +781,7 @@ class JobsController:
  class Controller:
      """Controller for managing jobs."""
 
-     def __init__(self):
+     def __init__(self) -> None:
          # Global state for active jobs
          self.job_tasks: Dict[int, asyncio.Task] = {}
          self.starting: Set[int] = set()
@@ -984,12 +984,14 @@ class Controller:
                  job_logger.info(
                      f'Cluster of managed job {job_id} has been cleaned up.')
              except Exception as e:  # pylint: disable=broad-except
+                 failure_reason = ('Failed to clean up: '
+                                   f'{common_utils.format_exception(e)}')
                  await managed_job_state.set_failed_async(
                      job_id,
                      task_id=None,
                      failure_type=managed_job_state.ManagedJobStatus.
                      FAILED_CONTROLLER,
-                     failure_reason=e,
+                     failure_reason=failure_reason,
                      override_terminal=True)
 
          if cancelling:
sky/jobs/recovery_strategy.py CHANGED
@@ -543,7 +543,7 @@ class StrategyExecutor:
 
              except exceptions.NoClusterLaunchedError:
                  # Update the status to PENDING during backoff.
-                 state.set_backoff_pending_async(self.job_id, self.task_id)
+                 await state.set_backoff_pending_async(self.job_id, self.task_id)
                  # Calculate the backoff time and sleep.
                  gap_seconds = (backoff.current_backoff()
                                 if self.pool is None else 1)
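
The one-line fix above adds a missing await. A small self-contained illustration (toy stand-in, not the real StrategyExecutor) of why that matters:

    import asyncio

    async def set_backoff_pending_async(job_id: int, task_id: int) -> None:
        # Toy stand-in for the real state update.
        print(f'job {job_id}, task {task_id} marked PENDING')

    async def without_await():
        # Creates a coroutine object that never runs; Python warns
        # "coroutine ... was never awaited" and the state never changes.
        set_backoff_pending_async(1, 0)

    async def with_await():
        await set_backoff_pending_async(1, 0)  # the update actually executes

    asyncio.run(with_await())
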
sky/jobs/state.py CHANGED
@@ -238,6 +238,7 @@ def _init_db_async(func):
                  last_exc = e
                  logger.debug(f'DB error: {last_exc}')
                  await asyncio.sleep(backoff.current_backoff())
+         assert last_exc is not None
          raise last_exc
 
      return wrapper
@@ -266,6 +267,7 @@ def _init_db(func):
                  last_exc = e
                  logger.debug(f'DB error: {last_exc}')
                  time.sleep(backoff.current_backoff())
+         assert last_exc is not None
          raise last_exc
 
      return wrapper
@@ -735,16 +737,21 @@ def set_pending_cancelled(job_id: int):
          # Subquery to get the spot_job_ids that match the joined condition
          subquery = session.query(spot_table.c.job_id).join(
              job_info_table,
-             spot_table.c.spot_job_id == job_info_table.c.spot_job_id).filter(
-                 spot_table.c.spot_job_id == job_id,
-                 spot_table.c.status == ManagedJobStatus.PENDING.value,
-                 sqlalchemy.or_(
-                     job_info_table.c.schedule_state ==
-                     ManagedJobScheduleState.WAITING.value,
-                     job_info_table.c.schedule_state ==
-                     ManagedJobScheduleState.INACTIVE.value,
-                 ),
-             ).subquery()
+             spot_table.c.spot_job_id == job_info_table.c.spot_job_id
+         ).filter(
+             spot_table.c.spot_job_id == job_id,
+             spot_table.c.status == ManagedJobStatus.PENDING.value,
+             # Note: it's possible that a WAITING job actually needs to be
+             # cleaned up, if we are in the middle of an upgrade/recovery and
+             # the job is waiting to be reclaimed by a new controller. But,
+             # in this case the status will not be PENDING.
+             sqlalchemy.or_(
+                 job_info_table.c.schedule_state ==
+                 ManagedJobScheduleState.WAITING.value,
+                 job_info_table.c.schedule_state ==
+                 ManagedJobScheduleState.INACTIVE.value,
+             ),
+         ).subquery()
 
          count = session.query(spot_table).filter(
              spot_table.c.job_id.in_(subquery)).update(
@@ -1105,8 +1112,11 @@ async def set_job_id_on_pool_cluster_async(job_id: int,
      """Set the job id on the pool cluster for a job."""
      assert _SQLALCHEMY_ENGINE_ASYNC is not None
      async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
-         await session.execute(job_info_table.c.spot_job_id == job_id).update(
-             {job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster})
+         await session.execute(
+             sqlalchemy.update(job_info_table).
+             where(job_info_table.c.spot_job_id == job_id).values({
+                 job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+             }))
          await session.commit()
 
 
@@ -1130,12 +1140,12 @@ async def get_pool_submit_info_async(
          job_id: int) -> Tuple[Optional[str], Optional[int]]:
      """Get the cluster name and job id on the pool from the managed job id."""
      assert _SQLALCHEMY_ENGINE_ASYNC is not None
-     async with orm.Session(_SQLALCHEMY_ENGINE_ASYNC) as session:
-         info = await session.execute(
+     async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+         result = await session.execute(
              sqlalchemy.select(job_info_table.c.current_cluster_name,
                                job_info_table.c.job_id_on_pool_cluster).where(
-                                   job_info_table.c.spot_job_id == job_id)
-         ).fetchone()
+                                   job_info_table.c.spot_job_id == job_id))
+         info = result.fetchone()
      if info is None:
          return None, None
      return info[0], info[1]
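
The set_job_id_on_pool_cluster_async fix above switches to SQLAlchemy's update()/where()/values() construct executed on an async session. A standalone sketch of that pattern (assumes the aiosqlite driver is installed; the table and values are illustrative):

    import asyncio

    import sqlalchemy
    from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

    metadata = sqlalchemy.MetaData()
    job_info = sqlalchemy.Table(
        'job_info', metadata,
        sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('job_id_on_pool_cluster', sqlalchemy.Integer))

    async def main():
        engine = create_async_engine('sqlite+aiosqlite:///:memory:')
        async with engine.begin() as conn:
            await conn.run_sync(metadata.create_all)
            await conn.execute(sqlalchemy.insert(job_info).values(spot_job_id=7))
        async with async_sessionmaker(engine)() as session:
            # The executable statement is built first, then passed to execute();
            # the old code passed a bare column comparison, which is not runnable.
            await session.execute(
                sqlalchemy.update(job_info).where(
                    job_info.c.spot_job_id == 7).values(job_id_on_pool_cluster=3))
            await session.commit()

    asyncio.run(main())
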
sky/jobs/utils.py CHANGED
@@ -586,7 +586,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
          raise
 
 
- def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
+ def event_callback_func(
+         job_id: int, task_id: Optional[int],
+         task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
      """Run event callback for the task."""
 
      def callback_func(status: str):
@@ -625,17 +627,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
              f'Bash:{event_callback},log_path:{log_path},result:{result}')
          logger.info(f'=== END: event callback for {status!r} ===')
 
-     try:
-         asyncio.get_running_loop()
-
-         # In async context
-         async def async_callback_func(status: str):
-             return await context_utils.to_thread(callback_func, status)
+     async def async_callback_func(status: str):
+         return await context_utils.to_thread(callback_func, status)
 
-         return async_callback_func
-     except RuntimeError:
-         # Not in async context
-         return callback_func
+     return async_callback_func
 
 
  # ======== user functions ========
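
event_callback_func now always returns an async wrapper that pushes the blocking callback onto a worker thread. A minimal illustration of the same pattern, using the standard-library asyncio.to_thread in place of context_utils.to_thread:

    import asyncio
    import time

    def callback_func(status: str) -> None:
        time.sleep(0.1)  # stand-in for running the event-callback shell command
        print(f'callback ran for status={status}')

    async def async_callback_func(status: str) -> None:
        # The blocking callback runs in a worker thread, so the controller's
        # event loop stays responsive while it executes.
        return await asyncio.to_thread(callback_func, status)

    asyncio.run(async_callback_func('RUNNING'))
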
sky/logs/agent.py CHANGED
@@ -35,9 +35,17 @@ class FluentbitAgent(LoggingAgent):
                   cluster_name: resources_utils.ClusterName) -> str:
          install_cmd = (
              'if ! command -v fluent-bit >/dev/null 2>&1; then '
-             'sudo apt-get install -y gnupg; '
+             'sudo apt-get update; sudo apt-get install -y gnupg; '
              # pylint: disable=line-too-long
-             'curl https://raw.githubusercontent.com/fluent/fluent-bit/master/install.sh | sh; '
+             'sudo sh -c \'curl https://packages.fluentbit.io/fluentbit.key | gpg --dearmor > /usr/share/keyrings/fluentbit-keyring.gpg\'; '
+             # pylint: disable=line-too-long
+             'os_id=$(grep -oP \'(?<=^ID=).*\' /etc/os-release 2>/dev/null || lsb_release -is 2>/dev/null | tr \'[:upper:]\' \'[:lower:]\'); '
+             # pylint: disable=line-too-long
+             'codename=$(grep -oP \'(?<=VERSION_CODENAME=).*\' /etc/os-release 2>/dev/null || lsb_release -cs 2>/dev/null); '
+             # pylint: disable=line-too-long
+             'echo "deb [signed-by=/usr/share/keyrings/fluentbit-keyring.gpg] https://packages.fluentbit.io/$os_id/$codename $codename main" | sudo tee /etc/apt/sources.list.d/fluent-bit.list; '
+             'sudo apt-get update; '
+             'sudo apt-get install -y fluent-bit; '
              'fi')
          cfg = self.fluentbit_config(cluster_name)
          cfg_path = os.path.join(constants.LOGGING_CONFIG_DIR, 'fluentbit.yaml')
sky/provision/kubernetes/config.py CHANGED
@@ -3,7 +3,7 @@ import copy
  import logging
  import math
  import os
- from typing import Any, Dict, Optional, Union
+ from typing import Any, Dict, List, Optional, Union
 
  from sky.adaptors import kubernetes
  from sky.provision import common
@@ -666,4 +666,9 @@ def _configure_services(namespace: str, context: Optional[str],
 
 
  class KubernetesError(Exception):
-     pass
+
+     def __init__(self,
+                  *args,
+                  insufficent_resources: Optional[List[str]] = None):
+         self.insufficent_resources = insufficent_resources
+         super().__init__(*args)
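
With the change above, callers can inspect which resources were exhausted. A self-contained sketch (the class is re-declared locally here; the attribute name insufficent_resources, spelled as in the diff, is what the package exposes):

    from typing import List, Optional

    class KubernetesError(Exception):

        def __init__(self, *args,
                     insufficent_resources: Optional[List[str]] = None):
            self.insufficent_resources = insufficent_resources
            super().__init__(*args)

    try:
        raise KubernetesError('Pod is pending',
                              insufficent_resources=['CPUs', 'Memory'])
    except KubernetesError as e:
        # Downstream logic (e.g. failover) can branch on the exhausted resources.
        if e.insufficent_resources:
            print('Out of:', ', '.join(e.insufficent_resources))
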
sky/provision/kubernetes/instance.py CHANGED
@@ -3,6 +3,7 @@ import copy
  import datetime
  import json
  import re
+ import sys
  import time
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -191,14 +192,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                  break
      if event_message is not None:
          if pod_status == 'Pending':
-             logger.info(event_message)
+             out_of = {}
+             # key: resource name, value: (extra message, nice name)
              if 'Insufficient cpu' in event_message:
-                 raise config_lib.KubernetesError(
-                     _lack_resource_msg('CPU', pod, details=event_message))
+                 out_of['CPU'] = (': Run \'kubectl get nodes -o '
+                                  'custom-columns=NAME:.metadata.name,'
+                                  'CPU:.status.allocatable.cpu\' to check '
+                                  'the available CPUs on the node.', 'CPUs')
              if 'Insufficient memory' in event_message:
-                 raise config_lib.KubernetesError(
-                     _lack_resource_msg('memory', pod,
-                                        details=event_message))
+                 out_of['memory'] = (': Run \'kubectl get nodes -o '
+                                     'custom-columns=NAME:.metadata.name,'
+                                     'MEMORY:.status.allocatable.memory\' '
+                                     'to check the available memory on the '
+                                     'node.', 'Memory')
+
              # TODO(aylei): after switching from smarter-device-manager to
              # fusermount-server, we need a new way to check whether the
              # fusermount-server daemonset is ready.
@@ -206,41 +213,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
                  for key in lf.get_label_keys()
              ]
-             if pod.spec.node_selector:
-                 for label_key in pod.spec.node_selector.keys():
-                     if label_key in gpu_lf_keys:
-                         # TODO(romilb): We may have additional node
-                         # affinity selectors in the future - in that
-                         # case we will need to update this logic.
-                         # TODO(Doyoung): Update the error message raised
-                         # with the multi-host TPU support.
-                         gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
-                         if 'Insufficient google.com/tpu' in event_message:
-                             extra_msg = (
-                                 f'Verify if '
-                                 f'{pod.spec.node_selector[label_key]}'
-                                 ' is available in the cluster. Note '
-                                 'that multi-host TPU podslices are '
-                                 'currently not unsupported.')
-                             raise config_lib.KubernetesError(
-                                 _lack_resource_msg('TPU',
-                                                    pod,
-                                                    extra_msg,
-                                                    details=event_message))
-                         elif ((f'Insufficient {gpu_resource_key}'
-                                in event_message) or
-                               ('didn\'t match Pod\'s node affinity/selector'
-                                in event_message)):
-                             extra_msg = (
-                                 f'Verify if any node matching label '
-                                 f'{pod.spec.node_selector[label_key]} and '
-                                 f'sufficient resource {gpu_resource_key} '
-                                 f'is available in the cluster.')
-                             raise config_lib.KubernetesError(
-                                 _lack_resource_msg('GPU',
-                                                    pod,
-                                                    extra_msg,
-                                                    details=event_message))
+             for label_key in gpu_lf_keys:
+                 # TODO(romilb): We may have additional node
+                 # affinity selectors in the future - in that
+                 # case we will need to update this logic.
+                 # TODO(Doyoung): Update the error message raised
+                 # with the multi-host TPU support.
+                 gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+                     context)  # pylint: disable=line-too-long
+                 if ((f'Insufficient {gpu_resource_key}' in event_message) or
+                     ('didn\'t match Pod\'s node affinity/selector'
+                      in event_message) and pod.spec.node_selector):
+                     if 'gpu' in gpu_resource_key.lower():
+                         info_msg = (
+                             ': Run \'sky show-gpus --infra kubernetes\' to '
+                             'see the available GPUs.')
+                     else:
+                         info_msg = ': '
+                     if (pod.spec.node_selector and
+                             label_key in pod.spec.node_selector):
+                         extra_msg = (
+                             f'Verify if any node matching label '
+                             f'{pod.spec.node_selector[label_key]} and '
+                             f'sufficient resource {gpu_resource_key} '
+                             f'is available in the cluster.')
+                         extra_msg = info_msg + ' ' + extra_msg
+                     else:
+                         extra_msg = info_msg
+                     if gpu_resource_key not in out_of or len(
+                             out_of[gpu_resource_key][0]) < len(extra_msg):
+                         out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+             if len(out_of) > 0:
+                 # We are out of some resources. We should raise an error.
+                 rsrc_err_msg = 'Insufficient resource capacity on the '
+                 rsrc_err_msg += 'cluster:\n'
+                 out_of_keys = list(out_of.keys())
+                 for i in range(len(out_of_keys)):
+                     rsrc = out_of_keys[i]
+                     (extra_msg, nice_name) = out_of[rsrc]
+                     extra_msg = extra_msg if extra_msg else ''
+                     if i == len(out_of_keys) - 1:
+                         indent = '└──'
+                     else:
+                         indent = '├──'
+                     rsrc_err_msg += (f'{indent} Cluster does not have '
+                                      f'sufficient {nice_name} for your request'
+                                      f'{extra_msg}')
+                     if i != len(out_of_keys) - 1:
+                         rsrc_err_msg += '\n'
+
+                 # Emit the error message without logging prefixes for better UX.
+                 tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+                 tmp_handler.flush = sys.stdout.flush
+                 tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+                 tmp_handler.setLevel(sky_logging.ERROR)
+                 prev_propagate = logger.propagate
+                 try:
+                     logger.addHandler(tmp_handler)
+                     logger.propagate = False
+                     logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+                 finally:
+                     logger.removeHandler(tmp_handler)
+                     logger.propagate = prev_propagate
+                 nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+                 raise config_lib.KubernetesError(
+                     f'{timeout_err_msg} '
+                     f'Pod status: {pod_status} '
+                     f'Details: \'{event_message}\' ',
+                     insufficent_resources=nice_names,
+                 )
+
          raise config_lib.KubernetesError(f'{timeout_err_msg} '
                                           f'Pod status: {pod_status} '
                                           f'Details: \'{event_message}\' ')
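
An illustrative re-creation (toy data, not the package's exact output) of how the new out_of bookkeeping is rendered into the tree-style error message:

    out_of = {
        'CPU': (": Run 'kubectl get nodes ...' to check the available CPUs.",
                'CPUs'),
        'memory': (": Run 'kubectl get nodes ...' to check the available memory.",
                   'Memory'),
    }

    lines = []
    keys = list(out_of)
    for i, rsrc in enumerate(keys):
        extra_msg, nice_name = out_of[rsrc]
        indent = '└──' if i == len(keys) - 1 else '├──'
        lines.append(f'{indent} Cluster does not have sufficient {nice_name} '
                     f'for your request{extra_msg}')
    print('Insufficient resource capacity on the cluster:\n' + '\n'.join(lines))
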
sky/provision/vast/instance.py CHANGED
@@ -39,7 +39,7 @@ def _filter_instances(cluster_name_on_cloud: str,
 
  def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
      for inst_id, inst in instances.items():
-         if inst['name'].endswith('-head'):
+         if inst.get('name') and inst['name'].endswith('-head'):
              return inst_id
      return None
 
sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py ADDED
@@ -0,0 +1,34 @@
+ """Add skylet_ssh_tunnel_metadata to clusters.
+
+ Revision ID: 008
+ Revises: 007
+ Create Date: 2025-09-09
+
+ """
+ # pylint: disable=invalid-name
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+
+ from sky.utils.db import db_utils
+
+ # revision identifiers, used by Alembic.
+ revision: str = '008'
+ down_revision: Union[str, Sequence[str], None] = '007'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade():
+     """Add skylet_ssh_tunnel_metadata column to clusters."""
+     with op.get_context().autocommit_block():
+         db_utils.add_column_to_table_alembic('clusters',
+                                              'skylet_ssh_tunnel_metadata',
+                                              sa.LargeBinary(),
+                                              server_default=None)
+
+
+ def downgrade():
+     """No-op for backward compatibility."""
+     pass
sky/server/config.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
 
  from sky import sky_logging
  from sky.server import constants as server_constants
+ from sky.server import daemons
  from sky.utils import common_utils
 
  # Constants based on profiling the peak memory usage while serving various
@@ -21,7 +22,7 @@ from sky.utils import common_utils
  # in the future.
  # TODO(luca): The future is now! ^^^
  LONG_WORKER_MEM_GB = 0.4
- SHORT_WORKER_MEM_GB = 0.25
+ SHORT_WORKER_MEM_GB = 0.3
  # To control the number of long workers.
  _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
  # Limit the number of long workers of local API server, since local server is
@@ -36,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
  # Minimal number of long workers to ensure responsiveness.
  _MIN_LONG_WORKERS = 1
- # Minimal number of short workers, there is a daemon task running on short
- # workers so at least 2 workers are needed to ensure responsiveness.
- _MIN_SHORT_WORKERS = 2
+ # Minimal number of idle short workers to ensure responsiveness.
+ _MIN_IDLE_SHORT_WORKERS = 1
 
  # Default number of burstable workers for local API server. A heuristic number
  # that is large enough for most local cases.
@@ -216,6 +216,15 @@ def _max_long_worker_parallism(cpu_count: int,
      return n
 
 
+ def _get_min_short_workers() -> int:
+     """Min number of short workers."""
+     daemon_count = 0
+     for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
+         if not daemon.should_skip():
+             daemon_count += 1
+     return _MIN_IDLE_SHORT_WORKERS + daemon_count
+
+
  def _max_short_worker_parallism(mem_size_gb: float,
                                  long_worker_parallism: int) -> int:
      """Max parallelism for short workers."""
@@ -227,5 +236,5 @@ def _max_short_worker_parallism(mem_size_gb: float,
                    server_constants.MIN_AVAIL_MEM_GB)
      reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
      available_mem = max(0, mem_size_gb - reserved_mem)
-     n = max(_MIN_SHORT_WORKERS, int(available_mem / SHORT_WORKER_MEM_GB))
+     n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
      return n
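
A rough sketch of the revised short-worker sizing with illustrative numbers; reserved_gb stands in for the max_memory term computed elsewhere in config.py, and the daemon count depends on which internal daemons are enabled:

    LONG_WORKER_MEM_GB = 0.4
    SHORT_WORKER_MEM_GB = 0.3
    MIN_IDLE_SHORT_WORKERS = 1

    def max_short_workers(mem_size_gb: float, long_workers: int,
                          active_daemons: int, reserved_gb: float) -> int:
        reserved_mem = reserved_gb + long_workers * LONG_WORKER_MEM_GB
        available_mem = max(0, mem_size_gb - reserved_mem)
        min_short = MIN_IDLE_SHORT_WORKERS + active_daemons
        return max(min_short, int(available_mem / SHORT_WORKER_MEM_GB))

    # e.g. 8 GB of memory, 4 long workers, 3 active daemons, ~2 GB otherwise reserved:
    print(max_short_workers(8.0, 4, 3, 2.0))  # -> 14 short workers
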
sky/server/metrics.py CHANGED
@@ -4,6 +4,7 @@ import contextlib
  import functools
  import multiprocessing
  import os
+ import threading
  import time
 
  import fastapi
@@ -21,6 +22,24 @@ from sky.skylet import constants
  METRICS_ENABLED = os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED,
                                   'false').lower() == 'true'
 
+ _KB = 2**10
+ _MB = 2**20
+ _MEM_BUCKETS = [
+     _KB,
+     256 * _KB,
+     512 * _KB,
+     _MB,
+     2 * _MB,
+     4 * _MB,
+     8 * _MB,
+     16 * _MB,
+     32 * _MB,
+     64 * _MB,
+     128 * _MB,
+     256 * _MB,
+     float('inf'),
+ ]
+
  logger = sky_logging.init_logger(__name__)
 
  # Total number of API server requests, grouped by path, method, and status.
@@ -92,6 +111,16 @@ SKY_APISERVER_PROCESS_CPU_TOTAL = prom.Gauge(
      ['pid', 'type', 'mode'],
  )
 
+ SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES = prom.Histogram(
+     'sky_apiserver_request_memory_usage_bytes',
+     'Peak memory usage of requests', ['name'],
+     buckets=_MEM_BUCKETS)
+
+ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
+     'sky_apiserver_request_rss_incr_bytes',
+     'RSS increment after requests', ['name'],
+     buckets=_MEM_BUCKETS)
+
  metrics_app = fastapi.FastAPI()
 
 
@@ -208,19 +237,23 @@ def time_me_async(func):
      return async_wrapper
 
 
- def process_monitor(process_type: str):
+ peak_rss_bytes = 0
+
+
+ def process_monitor(process_type: str, stop: threading.Event):
      pid = multiprocessing.current_process().pid
      proc = psutil.Process(pid)
-     peak_rss = 0
      last_bucket_end = time.time()
-     while True:
+     bucket_peak = 0
+     global peak_rss_bytes
+     while not stop.is_set():
          if time.time() - last_bucket_end >= 30:
-             # Reset peak RSS every 30 seconds.
+             # Reset peak RSS for the next time bucket.
              last_bucket_end = time.time()
-             peak_rss = 0
-         peak_rss = max(peak_rss, proc.memory_info().rss)
-         SKY_APISERVER_PROCESS_PEAK_RSS.labels(pid=pid,
-                                               type=process_type).set(peak_rss)
+             bucket_peak = 0
+         peak_rss_bytes = max(bucket_peak, proc.memory_info().rss)
+         SKY_APISERVER_PROCESS_PEAK_RSS.labels(
+             pid=pid, type=process_type).set(peak_rss_bytes)
          ctimes = proc.cpu_times()
          SKY_APISERVER_PROCESS_CPU_TOTAL.labels(pid=pid,
                                                 type=process_type,