skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.


Files changed (136)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend.py +5 -3
  4. sky/backends/backend_utils.py +22 -7
  5. sky/backends/cloud_vm_ray_backend.py +50 -18
  6. sky/backends/local_docker_backend.py +8 -3
  7. sky/client/cli/command.py +25 -10
  8. sky/client/sdk.py +51 -1
  9. sky/clouds/kubernetes.py +2 -6
  10. sky/clouds/nebius.py +3 -1
  11. sky/core.py +9 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  19. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  20. sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  23. sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  27. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
  28. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  29. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  32. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
  47. sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
  48. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  49. sky/dashboard/out/clusters/[cluster].html +1 -1
  50. sky/dashboard/out/clusters.html +1 -1
  51. sky/dashboard/out/config.html +1 -1
  52. sky/dashboard/out/index.html +1 -1
  53. sky/dashboard/out/infra/[context].html +1 -1
  54. sky/dashboard/out/infra.html +1 -1
  55. sky/dashboard/out/jobs/[job].html +1 -1
  56. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  57. sky/dashboard/out/jobs.html +1 -1
  58. sky/dashboard/out/users.html +1 -1
  59. sky/dashboard/out/volumes.html +1 -1
  60. sky/dashboard/out/workspace/new.html +1 -1
  61. sky/dashboard/out/workspaces/[name].html +1 -1
  62. sky/dashboard/out/workspaces.html +1 -1
  63. sky/data/storage_utils.py +29 -9
  64. sky/execution.py +13 -10
  65. sky/global_user_state.py +131 -2
  66. sky/jobs/constants.py +1 -1
  67. sky/jobs/recovery_strategy.py +0 -3
  68. sky/jobs/scheduler.py +14 -21
  69. sky/jobs/server/core.py +64 -10
  70. sky/jobs/server/utils.py +1 -1
  71. sky/jobs/state.py +1 -3
  72. sky/jobs/utils.py +159 -11
  73. sky/provision/aws/config.py +19 -3
  74. sky/provision/aws/instance.py +2 -1
  75. sky/provision/kubernetes/instance.py +2 -1
  76. sky/provision/nebius/utils.py +101 -86
  77. sky/provision/provisioner.py +13 -8
  78. sky/resources.py +5 -5
  79. sky/schemas/api/responses.py +50 -1
  80. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  81. sky/serve/replica_managers.py +123 -101
  82. sky/serve/serve_state.py +32 -0
  83. sky/serve/serve_utils.py +37 -16
  84. sky/serve/service.py +51 -17
  85. sky/server/common.py +2 -3
  86. sky/server/constants.py +1 -1
  87. sky/server/requests/payloads.py +6 -0
  88. sky/server/requests/serializers/decoders.py +20 -5
  89. sky/server/requests/serializers/encoders.py +21 -8
  90. sky/server/server.py +57 -11
  91. sky/templates/kubernetes-ray.yml.j2 +1 -0
  92. sky/utils/cli_utils/status_utils.py +2 -1
  93. sky/utils/common_utils.py +20 -0
  94. sky/utils/controller_utils.py +17 -4
  95. sky/utils/db/migration_utils.py +1 -1
  96. sky/utils/log_utils.py +14 -5
  97. sky/utils/resources_utils.py +25 -1
  98. sky/utils/schemas.py +3 -0
  99. sky/utils/ux_utils.py +36 -5
  100. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
  101. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
  102. sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  104. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  105. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  106. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  109. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  110. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  111. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  114. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  117. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  119. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
  121. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
  131. /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
  132. /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
  133. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '58649973a7c706775528a419f46ae024e59f4603'
+ _SKYPILOT_COMMIT_SHA = 'bff0c2a2d33d0990092c7c33a532359ffe1b6c56'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250814'
+ __version__ = '1.0.0.dev20250816'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/adaptors/nebius.py CHANGED
@@ -1,7 +1,8 @@
  """Nebius cloud adaptor."""
+ import asyncio
  import os
  import threading
- from typing import List, Optional
+ from typing import Any, Awaitable, List, Optional

  from sky import sky_logging
  from sky import skypilot_config
@@ -9,8 +10,49 @@ from sky.adaptors import common
  from sky.utils import annotations
  from sky.utils import ux_utils

+ # Default read timeout for nebius SDK
+ READ_TIMEOUT = 10
+
  logger = sky_logging.init_logger(__name__)

+ _loop_lock = threading.Lock()
+ _loop = None
+
+
+ def _get_event_loop() -> asyncio.AbstractEventLoop:
+ """Get event loop for nebius sdk."""
+ global _loop
+
+ if _loop is not None:
+ return _loop
+
+ with _loop_lock:
+ if _loop is None:
+ # Create a new event loop in a dedicated thread
+ _loop = asyncio.new_event_loop()
+ threading.Thread(target=_loop.run_forever, daemon=True).start()
+
+ return _loop
+
+
+ def sync_call(awaitable: Awaitable[Any]) -> Any:
+ """Synchronously run an awaitable in coroutine.
+
+ This wrapper is used to workaround:
+ https://github.com/nebius/pysdk/issues/76
+
+ Uses a dedicated background event loop to avoid conflicts
+ with existing asyncio contexts and prevent BlockingIOError.
+ """
+ loop = _get_event_loop()
+ future = asyncio.run_coroutine_threadsafe(_coro(awaitable), loop)
+ return future.result()
+
+
+ async def _coro(awaitable: Awaitable[Any]) -> Any:
+ """Wrapper coroutine for awaitable."""
+ return await awaitable
+

  def tenant_id_path() -> str:
  return '~/.nebius/NEBIUS_TENANT_ID.txt'
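
Note: the new sync_call helper above is an instance of the common "dedicated background event loop" pattern: one loop runs forever in a daemon thread, and callers submit coroutines to it with asyncio.run_coroutine_threadsafe and block on the returned future. A minimal, self-contained sketch of that pattern (illustrative only; the names and the example coroutine below are not part of SkyPilot) could look like:

    import asyncio
    import threading

    _loop = None
    _loop_lock = threading.Lock()


    def _get_background_loop() -> asyncio.AbstractEventLoop:
        """Lazily create one long-lived loop in a daemon thread."""
        global _loop
        if _loop is None:
            with _loop_lock:
                if _loop is None:
                    loop = asyncio.new_event_loop()
                    threading.Thread(target=loop.run_forever,
                                     daemon=True).start()
                    _loop = loop
        return _loop


    def sync_call(awaitable):
        """Block the calling thread until the awaitable completes."""

        async def _wrap():
            return await awaitable

        future = asyncio.run_coroutine_threadsafe(_wrap(),
                                                  _get_background_loop())
        return future.result()


    async def _example(x: int) -> int:
        await asyncio.sleep(0.1)
        return x * 2


    if __name__ == '__main__':
        print(sync_call(_example(21)))  # prints 42

Because the loop lives outside any caller's asyncio context, the synchronous caller never starts or nests an event loop of its own, which is the conflict the docstring above refers to.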
sky/backends/backend.py CHANGED
@@ -147,8 +147,9 @@ class Backend(Generic[_ResourceHandleType]):
  def teardown(self,
  handle: _ResourceHandleType,
  terminate: bool,
- purge: bool = False) -> None:
- self._teardown(handle, terminate, purge)
+ purge: bool = False,
+ explicitly_requested: bool = False) -> None:
+ self._teardown(handle, terminate, purge, explicitly_requested)

  def register_info(self, **kwargs) -> None:
  """Register backend-specific information."""
@@ -200,5 +201,6 @@ class Backend(Generic[_ResourceHandleType]):
  def _teardown(self,
  handle: _ResourceHandleType,
  terminate: bool,
- purge: bool = False):
+ purge: bool = False,
+ explicitly_requested: bool = False):
  raise NotImplementedError
sky/backends/backend_utils.py CHANGED
@@ -2017,7 +2017,15 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
  # or any other ongoing requests
- global_user_state.remove_cluster(cluster_name, terminate=True)
+ global_user_state.add_cluster_event(
+ cluster_name,
+ None,
+ 'Cluster has no YAML file. Removing the cluster from cache.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
+ global_user_state.remove_cluster(cluster_name,
+ terminate=True,
+ remove_events=True)
  logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
  'Removing the cluster from cache.')
  return None
@@ -2137,7 +2145,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  global_user_state.add_cluster_event(
  cluster_name,
  status_lib.ClusterStatus.UP,
- 'All nodes up + ray cluster healthy.',
+ 'All nodes up; SkyPilot runtime healthy.',
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True)
  global_user_state.add_or_update_cluster(cluster_name,
@@ -2277,9 +2285,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  -1,
  autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
  stream_logs=False)
- except exceptions.CommandError as e:
+ except (exceptions.CommandError,
+ grpc.FutureTimeoutError) as e:
  success = False
- if e.returncode == 255:
+ if isinstance(e, grpc.FutureTimeoutError) or (
+ isinstance(e, exceptions.CommandError) and
+ e.returncode == 255):
  word = 'autostopped' if noun == 'autostop' else 'autodowned'
  logger.debug(f'The cluster is likely {word}.')
  reset_local_autostop = False
@@ -2329,10 +2340,14 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # are only stored for an hour by default), so it is possible that
  # the previous event has a status reason, but now it does not.
  init_reason_regex = f'^Cluster is abnormal because {init_reason} .*'
+ log_message = f'Cluster is abnormal because {init_reason}'
+ if status_reason:
+ log_message += f' ({status_reason})'
+ log_message += '. Transitioned to INIT.'
  global_user_state.add_cluster_event(
  cluster_name,
  status_lib.ClusterStatus.INIT,
- f'Cluster is abnormal because {init_reason} ({status_reason}). Transitioned to INIT.',
+ log_message,
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True,
  duplicate_regex=init_reason_regex)
@@ -2345,10 +2360,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
  # STOPPED.
  backend = backends.CloudVmRayBackend()
- backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
  global_user_state.add_cluster_event(
- cluster_name, None, 'All nodes stopped, terminating cluster.',
+ cluster_name, None, 'All nodes terminated, cleaning up the cluster.',
  global_user_state.ClusterEventType.STATUS_CHANGE)
+ backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
  return global_user_state.get_cluster_from_name(cluster_name)

sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -1368,8 +1368,11 @@ class RetryingVmProvisioner(object):
  if not dryrun:
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
  os.system(f'touch {log_path}')
+
  rich_utils.force_update_status(
- ux_utils.spinner_message('Launching', log_path))
+ ux_utils.spinner_message('Launching',
+ log_path,
+ cluster_name=cluster_name))

  # Get previous cluster status
  cluster_exists = prev_cluster_status is not None
@@ -1539,6 +1542,7 @@ class RetryingVmProvisioner(object):
  requested_resources=requested_resources,
  ready=False,
  is_managed=self._is_managed,
+ provision_log_path=log_abs_path,
  )

  # Add cluster event for actual provisioning start.
@@ -1684,7 +1688,9 @@ class RetryingVmProvisioner(object):
  config_dict['handle'] = handle
  logger.info(
  ux_utils.finishing_message(
- f'Cluster launched: {cluster_name!r}.', log_path))
+ f'Cluster launched: {cluster_name!r}.',
+ log_path,
+ cluster_name=cluster_name))
  return config_dict

  # The cluster is not ready. We must perform error recording and/or
@@ -1818,7 +1824,8 @@ class RetryingVmProvisioner(object):
  log_abs_path,
  stream_logs=False,
  start_streaming_at='Shared connection to',
- line_processor=log_utils.RayUpLineProcessor(log_abs_path),
+ line_processor=log_utils.RayUpLineProcessor(
+ log_abs_path, cluster_name=cluster_handle.cluster_name),
  # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
  # time during 'ray up' if insufficient capacity occurs.
  env=dict(
@@ -3120,7 +3127,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  is_managed=self._is_managed)
  log_path = os.path.join(self.log_dir, 'provision.log')
  rich_utils.force_update_status(
- ux_utils.spinner_message('Launching', log_path))
+ ux_utils.spinner_message('Launching',
+ log_path,
+ cluster_name=cluster_name))
  config_dict = retry_provisioner.provision_with_retries(
  task, to_provision_config, dryrun, stream_logs,
  skip_unnecessary_provisioning)
@@ -3159,8 +3168,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # Do not remove the stopped cluster from the global state
  # if failed to start.
  if not e.no_failover:
+ global_user_state.add_cluster_event(
+ cluster_name,
+ None,
+ 'Provision failed: ' + str(e),
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
  global_user_state.remove_cluster(cluster_name,
- terminate=True)
+ terminate=True,
+ remove_events=False)
  usage_lib.messages.usage.update_final_cluster_status(
  None)
  logger.error(
@@ -3962,7 +3978,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  def _teardown(self,
  handle: CloudVmRayResourceHandle,
  terminate: bool,
- purge: bool = False):
+ purge: bool = False,
+ explicitly_requested: bool = False):
  """Tear down or stop the cluster.

  Args:
@@ -4037,7 +4054,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # ClusterOwnerIdentityMismatchError. The argument/flag
  # `purge` should bypass such ID mismatch errors.
  refresh_cluster_status=(
- not is_identity_mismatch_and_purge))
+ not is_identity_mismatch_and_purge),
+ explicitly_requested=explicitly_requested)
  if terminate:
  lock.force_unlock()
  break
@@ -4418,7 +4436,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  purge: bool = False,
  post_teardown_cleanup: bool = True,
  refresh_cluster_status: bool = True,
- remove_from_db: bool = True) -> None:
+ remove_from_db: bool = True,
+ explicitly_requested: bool = False) -> None:
  """Teardown the cluster without acquiring the cluster status lock.

  NOTE: This method should not be called without holding the cluster
@@ -4482,7 +4501,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  f'provision yaml so it '
  'has not been provisioned. Skipped.')
  global_user_state.remove_cluster(handle.cluster_name,
- terminate=terminate)
+ terminate=terminate,
+ remove_events=False)
  return
  log_path = os.path.join(os.path.expanduser(self.log_dir),
  'teardown.log')
@@ -4539,8 +4559,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  raise

  if post_teardown_cleanup:
- self.post_teardown_cleanup(handle, terminate, purge,
- remove_from_db)
+ self.post_teardown_cleanup(
+ handle,
+ terminate,
+ purge,
+ remove_from_db,
+ explicitly_requested=explicitly_requested)
  return

  if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4640,7 +4664,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  terminate: bool,
  purge: bool = False,
  remove_from_db: bool = True,
- failover: bool = False) -> None:
+ failover: bool = False,
+ explicitly_requested: bool = False) -> None:
  """Cleanup local configs/caches and delete TPUs after teardown.

  This method will handle the following cleanup steps:
@@ -4819,7 +4844,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  if not terminate or remove_from_db:
  global_user_state.remove_cluster(handle.cluster_name,
- terminate=terminate)
+ terminate=terminate,
+ remove_events=explicitly_requested)

  def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
  """Remove the YAML config of a cluster."""
@@ -4928,11 +4954,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # We cannot check if the cluster is autostopping.
  return False
  if handle.is_grpc_enabled:
- request = autostopv1_pb2.IsAutostoppingRequest()
- response = backend_utils.invoke_skylet_with_retries(
- handle, lambda: SkyletClient(handle.get_grpc_channel()).
- is_autostopping(request))
- return response.is_autostopping
+ try:
+ request = autostopv1_pb2.IsAutostoppingRequest()
+ response = backend_utils.invoke_skylet_with_retries(
+ handle, lambda: SkyletClient(handle.get_grpc_channel()).
+ is_autostopping(request))
+ return response.is_autostopping
+ except Exception as e: # pylint: disable=broad-except
+ # The cluster may have been terminated, causing the gRPC call
+ # to timeout and fail.
+ logger.debug(f'Failed to check if cluster is autostopping: {e}')
+ return False
  else:
  logger.info(
  'Using legacy remote execution for is_autostopping on '
sky/backends/local_docker_backend.py CHANGED
@@ -256,7 +256,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
  logger.error(
  'Unable to run container - nvidia runtime for docker not '
  'found. Have you installed nvidia-docker on your machine?')
- global_user_state.remove_cluster(cluster_name, terminate=True)
+ global_user_state.remove_cluster(cluster_name,
+ terminate=True,
+ remove_events=False)
  raise e
  self.containers[handle] = container
  logger.info(
@@ -323,7 +325,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
  def _teardown(self,
  handle: LocalDockerResourceHandle,
  terminate: bool,
- purge: bool = False):
+ purge: bool = False,
+ explicitly_requested: bool = False):
  """Teardown kills the container."""
  del purge # Unused.
  if not terminate:
@@ -339,7 +342,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
  container.remove(force=True)
  cluster_name = handle.get_cluster_name()

- global_user_state.remove_cluster(cluster_name, terminate=True)
+ global_user_state.remove_cluster(cluster_name,
+ terminate=True,
+ remove_events=explicitly_requested)

  # --- Utilities ---

sky/client/cli/command.py CHANGED
@@ -60,6 +60,7 @@ from sky.client.cli import git
  from sky.data import storage_utils
  from sky.provision.kubernetes import constants as kubernetes_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.schemas.api import responses
  from sky.server import common as server_common
  from sky.server import constants as server_constants
  from sky.server.requests import requests
@@ -123,7 +124,7 @@ def _get_cluster_records_and_set_ssh_config(
  clusters: Optional[List[str]],
  refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
  all_users: bool = False,
- ) -> List[dict]:
+ ) -> List[responses.StatusResponse]:
  """Returns a list of clusters that match the glob pattern.

  Args:
@@ -1562,7 +1563,7 @@ def _status_kubernetes(show_all: bool):


  def _show_endpoint(query_clusters: Optional[List[str]],
- cluster_records: List[Dict[str, Any]], ip: bool,
+ cluster_records: List[responses.StatusResponse], ip: bool,
  endpoints: bool, endpoint: Optional[int]) -> None:
  show_endpoints = endpoints or endpoint is not None
  show_single_endpoint = endpoint is not None
@@ -2171,6 +2172,10 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):

  @cli.command()
  @flags.config_option(expose_value=False)
+ @click.option('--provision',
+ is_flag=True,
+ default=False,
+ help='Stream the cluster provisioning logs (provision.log).')
  @click.option(
  '--sync-down',
  '-s',
@@ -2207,6 +2212,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
  def logs(
  cluster: str,
  job_ids: Tuple[str, ...],
+ provision: bool,
  sync_down: bool,
  status: bool, # pylint: disable=redefined-outer-name
  follow: bool,
@@ -2236,6 +2242,11 @@ def logs(
  4. If the job fails or fetching the logs fails, the command will exit with
  a non-zero return code.
  """
+ if provision and (sync_down or status or job_ids):
+ raise click.UsageError(
+ '--provision cannot be combined with job log options '
+ '(--sync-down/--status/job IDs).')
+
  if sync_down and status:
  raise click.UsageError(
  'Both --sync_down and --status are specified '
@@ -2248,6 +2259,10 @@ def logs(

  job_ids = None if not job_ids else job_ids

+ if provision:
+ # Stream provision logs
+ sys.exit(sdk.tail_provision_logs(cluster, follow=follow, tail=tail))
+
  if sync_down:
  with rich_utils.client_status(
  ux_utils.spinner_message('Downloading logs')):
@@ -4786,7 +4801,7 @@ def pool():
  type=str,
  nargs=-1,
  **_get_shell_complete_args(_complete_file_name))
- @click.option('--pool-name',
+ @click.option('--pool',
  '-p',
  default=None,
  type=str,
@@ -4808,7 +4823,7 @@ def pool():
  @usage_lib.entrypoint
  def jobs_pool_apply(
  pool_yaml: Tuple[str, ...],
- pool_name: Optional[str],
+ pool: Optional[str], # pylint: disable=redefined-outer-name
  workdir: Optional[str],
  infra: Optional[str],
  cloud: Optional[str],
@@ -4841,11 +4856,11 @@ def jobs_pool_apply(
  """
  cloud, region, zone = _handle_infra_cloud_region_zone_options(
  infra, cloud, region, zone)
- if pool_name is None:
- pool_name = serve_lib.generate_service_name(pool=True)
+ if pool is None:
+ pool = serve_lib.generate_service_name(pool=True)

  task = _generate_task_with_service(
- service_name=pool_name,
+ service_name=pool,
  service_yaml_args=pool_yaml,
  workdir=workdir,
  cloud=cloud,
@@ -4882,7 +4897,7 @@ def jobs_pool_apply(
  dag.add(task)

  request_id = managed_jobs.pool_apply(task,
- pool_name,
+ pool,
  mode=serve_lib.UpdateMode(mode),
  _need_confirmation=not yes)
  _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -5120,7 +5135,7 @@ def _handle_serve_logs(
  @usage_lib.entrypoint
  # TODO(tian): Add default argument for this CLI if none of the flags are
  # specified.
- def pool_logs(
+ def jobs_pool_logs(
  pool_name: str,
  follow: bool,
  controller: bool,
@@ -6037,7 +6052,7 @@ def api_logs(request_id: Optional[str], server_logs: bool,
  # server accepts log_path-only streaming.
  req_id = (server_common.RequestId[None](request_id)
  if request_id is not None else None)
- sdk.stream_and_get(req_id, log_path, tail, follow=follow)
+ sdk.stream_and_get(req_id, log_path, tail, follow)


  @api.command('cancel', cls=_DocumentedCodeCommand)
sky/client/sdk.py CHANGED
@@ -855,6 +855,56 @@ def tail_logs(cluster_name: str,
  resumable=(tail == 0))


+ @usage_lib.entrypoint
+ @server_common.check_server_healthy_or_start
+ @versions.minimal_api_version(17)
+ @annotations.client_api
+ @rest.retry_transient_errors()
+ def tail_provision_logs(cluster_name: str,
+ follow: bool = True,
+ tail: int = 0,
+ output_stream: Optional['io.TextIOBase'] = None) -> int:
+ """Tails the provisioning logs (provision.log) for a cluster.
+
+ Args:
+ cluster_name: name of the cluster.
+ follow: follow the logs.
+ tail: lines from end to tail.
+ output_stream: optional stream to write logs.
+ Returns:
+ Exit code 0 on streaming success; raises on HTTP error.
+ """
+ body = payloads.ClusterNameBody(cluster_name=cluster_name)
+ params = {
+ 'follow': str(follow).lower(),
+ 'tail': tail,
+ }
+ response = server_common.make_authenticated_request(
+ 'POST',
+ '/provision_logs',
+ json=json.loads(body.model_dump_json()),
+ params=params,
+ stream=True,
+ timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
+ None))
+ # Log request is idempotent when tail is 0, thus can resume previous
+ # streaming point on retry.
+ # request_id=None here because /provision_logs does not create an async
+ # request. Instead, it streams a plain file from the server. This does NOT
+ # violate the stream_response doc warning about None in multi-user
+ # environments: we are not asking stream_response to select “the latest
+ # request”. We already have the HTTP response to stream; request_id=None
+ # merely disables the follow-up GET. It is also necessary for --no-follow
+ # to return cleanly after printing the tailed lines. If we provided a
+ # non-None request_id here, the get(request_id) in stream_response(
+ # would fail since /provision_logs does not create a request record.
+ stream_response(request_id=None,
+ response=response,
+ output_stream=output_stream,
+ resumable=(tail == 0))
+ return 0
+
+
  @usage_lib.entrypoint
  @server_common.check_server_healthy_or_start
  @annotations.client_api
@@ -1322,7 +1372,7 @@ def status(
  cluster_names: Optional[List[str]] = None,
  refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
  all_users: bool = False,
- ) -> server_common.RequestId[List[Dict[str, Any]]]:
+ ) -> server_common.RequestId[List[responses.StatusResponse]]:
  """Gets cluster statuses.

  If cluster_names is given, return those clusters. Otherwise, return all
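
For context, the new tail_provision_logs client API pairs with the sky logs --provision flag added in sky/client/cli/command.py above. A hypothetical programmatic use (the cluster name 'my-cluster' and the surrounding script are assumptions, not part of this diff) might look like:

    import io
    import sys

    from sky.client import sdk

    # Follow provision.log for an existing cluster, starting from the last
    # 100 lines; returns 0 when streaming completes.
    exit_code = sdk.tail_provision_logs('my-cluster', follow=True, tail=100)

    # Or capture a one-shot tail into a buffer instead of stdout.
    buf = io.StringIO()
    sdk.tail_provision_logs('my-cluster', follow=False, tail=50,
                            output_stream=buf)
    print(buf.getvalue())

    sys.exit(exit_code)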
sky/clouds/kubernetes.py CHANGED
@@ -3,7 +3,6 @@ import os
  import re
  import subprocess
  import tempfile
- import typing
  from typing import Dict, Iterator, List, Optional, Set, Tuple, Union

  import colorama
@@ -11,6 +10,7 @@ import colorama
  from sky import catalog
  from sky import clouds
  from sky import exceptions
+ from sky import resources as resources_lib
  from sky import sky_logging
  from sky import skypilot_config
  from sky.adaptors import kubernetes
@@ -31,10 +31,6 @@ from sky.utils import resources_utils
  from sky.utils import schemas
  from sky.utils import volume as volume_lib

- if typing.TYPE_CHECKING:
- # Renaming to avoid shadowing variables.
- from sky import resources as resources_lib
-
  logger = sky_logging.init_logger(__name__)

  # Namespace for SkyPilot resources shared across multiple tenants on the
@@ -773,7 +769,7 @@ class Kubernetes(clouds.Cloud):

  @staticmethod
  def _warn_on_disk_size(resources: 'resources_lib.Resources'):
- if resources.disk_size is not None:
+ if resources.disk_size != resources_lib.DEFAULT_DISK_SIZE_GB:
  logger.info(f'{colorama.Style.DIM}Disk size {resources.disk_size} '
  'is not supported by Kubernetes. '
  'To add additional disk, use volumes.'
sky/clouds/nebius.py CHANGED
@@ -442,7 +442,9 @@ class Nebius(clouds.Cloud):
  del workspace_config # Unused
  sdk = nebius.sdk()
  profile_client = nebius.iam().ProfileServiceClient(sdk)
- profile = profile_client.get(nebius.iam().GetProfileRequest()).wait()
+ profile = nebius.sync_call(
+ profile_client.get(nebius.iam().GetProfileRequest(),
+ timeout=nebius.READ_TIMEOUT))
  if profile.user_profile is not None:
  if profile.user_profile.attributes is None:
  raise exceptions.CloudUserIdentityError(
sky/core.py CHANGED
@@ -25,6 +25,7 @@ from sky.clouds import cloud as sky_cloud
  from sky.jobs.server import core as managed_jobs_core
  from sky.provision.kubernetes import constants as kubernetes_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.schemas.api import responses
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.skylet import job_lib
@@ -95,7 +96,7 @@ def status(
  cluster_names: Optional[Union[str, List[str]]] = None,
  refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
  all_users: bool = False,
- ) -> List[Dict[str, Any]]:
+ ) -> List[responses.StatusResponse]:
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
  """Gets cluster statuses.

@@ -171,7 +172,9 @@ def status(
  clusters = backend_utils.get_clusters(refresh=refresh,
  cluster_names=cluster_names,
  all_users=all_users)
- return clusters
+ return [
+ responses.StatusResponse.model_validate(cluster) for cluster in clusters
+ ]


  def status_kubernetes(
@@ -593,7 +596,10 @@ def down(cluster_name: str, purge: bool = False) -> None:

  usage_lib.record_cluster_name_for_current_operation(cluster_name)
  backend = backend_utils.get_backend_from_handle(handle)
- backend.teardown(handle, terminate=True, purge=purge)
+ backend.teardown(handle,
+ terminate=True,
+ purge=purge,
+ explicitly_requested=True)


  @usage_lib.entrypoint
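
A consequence of the return-type change above is that sky.core.status() (and the corresponding sdk.status() request) now yields responses.StatusResponse objects rather than plain dicts. A hedged sketch of how a caller might adapt (the field names are not shown in this diff, so the example only uses generic pydantic v2 methods; StatusResponse is assumed to be a pydantic model given the model_validate call above):

    from sky import core

    # Each record is now a pydantic model; convert back to a dict if the
    # caller still expects the old dict-shaped records.
    for record in core.status(all_users=True):
        as_dict = record.model_dump()
        print(as_dict)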
sky/dashboard/out/404.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-00c0a51d21157453.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c2ea34fda4f1f8c8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Y0eNlwi85qGRecLTin11y/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"Y0eNlwi85qGRecLTin11y","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6987eb47888da9c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yW7-Bc1l0EwIosbauU8LZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"yW7-Bc1l0EwIosbauU8LZ","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>