skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (136) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend.py +5 -3
  4. sky/backends/backend_utils.py +22 -7
  5. sky/backends/cloud_vm_ray_backend.py +50 -18
  6. sky/backends/local_docker_backend.py +8 -3
  7. sky/client/cli/command.py +25 -10
  8. sky/client/sdk.py +51 -1
  9. sky/clouds/kubernetes.py +2 -6
  10. sky/clouds/nebius.py +3 -1
  11. sky/core.py +9 -3
  12. sky/dashboard/out/404.html +1 -1
  13. sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  19. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  20. sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  23. sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  27. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
  28. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  29. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  32. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
  41. sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
  47. sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
  48. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  49. sky/dashboard/out/clusters/[cluster].html +1 -1
  50. sky/dashboard/out/clusters.html +1 -1
  51. sky/dashboard/out/config.html +1 -1
  52. sky/dashboard/out/index.html +1 -1
  53. sky/dashboard/out/infra/[context].html +1 -1
  54. sky/dashboard/out/infra.html +1 -1
  55. sky/dashboard/out/jobs/[job].html +1 -1
  56. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  57. sky/dashboard/out/jobs.html +1 -1
  58. sky/dashboard/out/users.html +1 -1
  59. sky/dashboard/out/volumes.html +1 -1
  60. sky/dashboard/out/workspace/new.html +1 -1
  61. sky/dashboard/out/workspaces/[name].html +1 -1
  62. sky/dashboard/out/workspaces.html +1 -1
  63. sky/data/storage_utils.py +29 -9
  64. sky/execution.py +13 -10
  65. sky/global_user_state.py +131 -2
  66. sky/jobs/constants.py +1 -1
  67. sky/jobs/recovery_strategy.py +0 -3
  68. sky/jobs/scheduler.py +14 -21
  69. sky/jobs/server/core.py +64 -10
  70. sky/jobs/server/utils.py +1 -1
  71. sky/jobs/state.py +1 -3
  72. sky/jobs/utils.py +159 -11
  73. sky/provision/aws/config.py +19 -3
  74. sky/provision/aws/instance.py +2 -1
  75. sky/provision/kubernetes/instance.py +2 -1
  76. sky/provision/nebius/utils.py +101 -86
  77. sky/provision/provisioner.py +13 -8
  78. sky/resources.py +5 -5
  79. sky/schemas/api/responses.py +50 -1
  80. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  81. sky/serve/replica_managers.py +123 -101
  82. sky/serve/serve_state.py +32 -0
  83. sky/serve/serve_utils.py +37 -16
  84. sky/serve/service.py +51 -17
  85. sky/server/common.py +2 -3
  86. sky/server/constants.py +1 -1
  87. sky/server/requests/payloads.py +6 -0
  88. sky/server/requests/serializers/decoders.py +20 -5
  89. sky/server/requests/serializers/encoders.py +21 -8
  90. sky/server/server.py +57 -11
  91. sky/templates/kubernetes-ray.yml.j2 +1 -0
  92. sky/utils/cli_utils/status_utils.py +2 -1
  93. sky/utils/common_utils.py +20 -0
  94. sky/utils/controller_utils.py +17 -4
  95. sky/utils/db/migration_utils.py +1 -1
  96. sky/utils/log_utils.py +14 -5
  97. sky/utils/resources_utils.py +25 -1
  98. sky/utils/schemas.py +3 -0
  99. sky/utils/ux_utils.py +36 -5
  100. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
  101. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
  102. sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  104. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  105. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  106. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  109. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  110. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  111. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  114. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  117. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  119. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
  121. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
  131. /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
  132. /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
  133. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/serve/service.py CHANGED
@@ -113,6 +113,9 @@ def cleanup_storage(task_yaml: str) -> bool:
113
113
  return not failed
114
114
 
115
115
 
116
+ # NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
117
+ # because we killed all the processes (controller & replica manager) before
118
+ # calling this function.
116
119
  def _cleanup(service_name: str) -> bool:
117
120
  """Clean up all service related resources, i.e. replicas and storage."""
118
121
  # Cleanup the HA recovery script first as it is possible that some error
@@ -135,28 +138,59 @@ def _cleanup(service_name: str) -> bool:
135
138
  continue
136
139
  p = multiprocessing.Process(target=replica_managers.terminate_cluster,
137
140
  args=(info.cluster_name,))
138
- p.start()
139
141
  info2proc[info] = p
140
142
  # Set replica status to `SHUTTING_DOWN`
141
143
  info.status_property.sky_launch_status = (
142
- replica_managers.ProcessStatus.SUCCEEDED)
144
+ replica_managers.common_utils.ProcessStatus.SUCCEEDED)
143
145
  info.status_property.sky_down_status = (
144
- replica_managers.ProcessStatus.RUNNING)
146
+ replica_managers.common_utils.ProcessStatus.SCHEDULED)
145
147
  serve_state.add_or_update_replica(service_name, info.replica_id, info)
146
- logger.info(f'Terminating replica {info.replica_id} ...')
147
- for info, p in info2proc.items():
148
- p.join()
149
- if p.exitcode == 0:
150
- serve_state.remove_replica(service_name, info.replica_id)
151
- logger.info(f'Replica {info.replica_id} terminated successfully.')
152
- else:
153
- # Set replica status to `FAILED_CLEANUP`
154
- info.status_property.sky_down_status = (
155
- replica_managers.ProcessStatus.FAILED)
156
- serve_state.add_or_update_replica(service_name, info.replica_id,
157
- info)
158
- failed = True
159
- logger.error(f'Replica {info.replica_id} failed to terminate.')
148
+ logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
149
+
150
+ def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
151
+ nonlocal failed
152
+ # Set replica status to `FAILED_CLEANUP`
153
+ info.status_property.sky_down_status = (
154
+ replica_managers.common_utils.ProcessStatus.FAILED)
155
+ serve_state.add_or_update_replica(service_name, info.replica_id, info)
156
+ failed = True
157
+ logger.error(f'Replica {info.replica_id} failed to terminate.')
158
+
159
+ # Please refer to sky/serve/replica_managers.py::_refresh_process_pool.
160
+ # TODO(tian): Refactor to use the same logic and code.
161
+ while info2proc:
162
+ snapshot = list(info2proc.items())
163
+ for info, p in snapshot:
164
+ if p.is_alive():
165
+ continue
166
+ if (info.status_property.sky_down_status ==
167
+ replica_managers.common_utils.ProcessStatus.SCHEDULED):
168
+ if controller_utils.can_terminate():
169
+ try:
170
+ p.start()
171
+ except Exception as e: # pylint: disable=broad-except
172
+ _set_to_failed_cleanup(info)
173
+ logger.error(f'Failed to start process for replica '
174
+ f'{info.replica_id}: {e}')
175
+ del info2proc[info]
176
+ else:
177
+ info.status_property.sky_down_status = (
178
+ common_utils.ProcessStatus.RUNNING)
179
+ serve_state.add_or_update_replica(
180
+ service_name, info.replica_id, info)
181
+ else:
182
+ logger.info('Terminate process for replica '
183
+ f'{info.replica_id} finished.')
184
+ p.join()
185
+ del info2proc[info]
186
+ if p.exitcode == 0:
187
+ serve_state.remove_replica(service_name, info.replica_id)
188
+ logger.info(
189
+ f'Replica {info.replica_id} terminated successfully.')
190
+ else:
191
+ _set_to_failed_cleanup(info)
192
+ time.sleep(3)
193
+
160
194
  versions = serve_state.get_service_versions(service_name)
161
195
  serve_state.remove_service_versions(service_name)
162
196
 
sky/server/common.py CHANGED
@@ -5,7 +5,6 @@ import enum
5
5
  import functools
6
6
  from http.cookiejar import CookieJar
7
7
  from http.cookiejar import MozillaCookieJar
8
- import json
9
8
  import os
10
9
  import pathlib
11
10
  import re
@@ -372,7 +371,7 @@ def _handle_non_200_server_status(
372
371
  '') == ApiServerStatus.VERSION_MISMATCH.value):
373
372
  return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
374
373
  error=body.get('message', ''))
375
- except json.JSONDecodeError:
374
+ except requests.JSONDecodeError:
376
375
  pass
377
376
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
378
377
 
@@ -463,7 +462,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
463
462
  # OAuth.
464
463
  set_api_cookie_jar(cookies, create_if_not_exists=True)
465
464
  return server_info
466
- except (json.JSONDecodeError, AttributeError) as e:
465
+ except (requests.JSONDecodeError, AttributeError) as e:
467
466
  # Try to check if we got redirected to a login page.
468
467
  for prev_response in response.history:
469
468
  logger.debug(f'Previous response: {prev_response.url}')
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
10
10
  # based on version info is needed.
11
11
  # For more details and code guidelines, refer to:
12
12
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
- API_VERSION = 16
13
+ API_VERSION = 17
14
14
 
15
15
  # The minimum peer API version that the code should still work with.
16
16
  # Notes (dev):
@@ -497,6 +497,12 @@ class JobsQueueBody(RequestBody):
497
497
  skip_finished: bool = False
498
498
  all_users: bool = False
499
499
  job_ids: Optional[List[int]] = None
500
+ user_match: Optional[str] = None
501
+ workspace_match: Optional[str] = None
502
+ name_match: Optional[str] = None
503
+ pool_match: Optional[str] = None
504
+ page: Optional[int] = None
505
+ limit: Optional[int] = None
500
506
 
501
507
 
502
508
  class JobsCancelBody(RequestBody):
@@ -9,6 +9,7 @@ from sky import models
9
9
  from sky.catalog import common
10
10
  from sky.data import storage
11
11
  from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.schemas.api import responses
12
13
  from sky.serve import serve_state
13
14
  from sky.server import constants as server_constants
14
15
  from sky.skylet import job_lib
@@ -50,13 +51,17 @@ def default_decode_handler(return_value: Any) -> Any:
50
51
 
51
52
 
52
53
  @register_decoders('status')
53
- def decode_status(return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
54
+ def decode_status(
55
+ return_value: List[Dict[str, Any]]) -> List[responses.StatusResponse]:
54
56
  clusters = return_value
57
+ response = []
55
58
  for cluster in clusters:
56
59
  cluster['handle'] = decode_and_unpickle(cluster['handle'])
57
60
  cluster['status'] = status_lib.ClusterStatus(cluster['status'])
58
-
59
- return clusters
61
+ cluster['storage_mounts_metadata'] = decode_and_unpickle(
62
+ cluster['storage_mounts_metadata'])
63
+ response.append(responses.StatusResponse.model_validate(cluster))
64
+ return response
60
65
 
61
66
 
62
67
  @register_decoders('status_kubernetes')
@@ -102,8 +107,18 @@ def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
102
107
 
103
108
 
104
109
  @register_decoders('jobs.queue')
105
- def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
106
- jobs = return_value
110
+ def decode_jobs_queue(return_value):
111
+ """Decode jobs queue response.
112
+
113
+ Supports legacy list, or a dict {jobs, total}.
114
+ - Returns list[job]
115
+ """
116
+ # Case 1: dict shape {jobs, total}
117
+ if isinstance(return_value, dict) and 'jobs' in return_value:
118
+ jobs = return_value.get('jobs', [])
119
+ else:
120
+ # Case 2: legacy list
121
+ jobs = return_value
107
122
  for job in jobs:
108
123
  job['status'] = managed_jobs.ManagedJobStatus(job['status'])
109
124
  return jobs
@@ -8,6 +8,7 @@ import pickle
8
8
  import typing
9
9
  from typing import Any, Dict, List, Optional, Tuple
10
10
 
11
+ from sky.schemas.api import responses
11
12
  from sky.server import constants as server_constants
12
13
 
13
14
  if typing.TYPE_CHECKING:
@@ -51,13 +52,17 @@ def default_encoder(return_value: Any) -> Any:
51
52
 
52
53
 
53
54
  @register_encoder('status')
54
- def encode_status(clusters: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
55
+ def encode_status(
56
+ clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
57
+ response = []
55
58
  for cluster in clusters:
56
- cluster['status'] = cluster['status'].value
57
- cluster['handle'] = pickle_and_encode(cluster['handle'])
58
- cluster['storage_mounts_metadata'] = pickle_and_encode(
59
- cluster['storage_mounts_metadata'])
60
- return clusters
59
+ response_cluster = cluster.model_dump()
60
+ response_cluster['status'] = cluster['status'].value
61
+ response_cluster['handle'] = pickle_and_encode(cluster['handle'])
62
+ response_cluster['storage_mounts_metadata'] = pickle_and_encode(
63
+ response_cluster['storage_mounts_metadata'])
64
+ response.append(response_cluster)
65
+ return response
61
66
 
62
67
 
63
68
  @register_encoder('launch', 'exec', 'jobs.launch')
@@ -106,10 +111,18 @@ def encode_status_kubernetes(
106
111
 
107
112
 
108
113
  @register_encoder('jobs.queue')
109
- def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
114
+ def encode_jobs_queue(jobs_or_tuple):
115
+ # Support returning either a plain jobs list or a (jobs, total) tuple.
116
+ if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
117
+ jobs, total = jobs_or_tuple
118
+ else:
119
+ jobs = jobs_or_tuple
120
+ total = None
110
121
  for job in jobs:
111
122
  job['status'] = job['status'].value
112
- return jobs
123
+ if total is None:
124
+ return jobs
125
+ return {'jobs': jobs, 'total': total}
113
126
 
114
127
 
115
128
  def _encode_serve_status(
sky/server/server.py CHANGED
@@ -792,8 +792,6 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
792
792
  ctx.override_envs(validate_body.env_vars)
793
793
 
794
794
  def validate_dag(dag: dag_utils.dag_lib.Dag):
795
- # Resolve the volumes before admin policy and validation.
796
- dag.resolve_and_validate_volumes()
797
795
  # TODO: Admin policy may contain arbitrary code, which may be expensive
798
796
  # to run and may block the server thread. However, moving it into the
799
797
  # executor adds a ~150ms penalty on the local API server because of
@@ -802,6 +800,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
802
800
  with admin_policy_utils.apply_and_use_config_in_current_request(
803
801
  dag,
804
802
  request_options=validate_body.get_request_options()) as dag:
803
+ dag.resolve_and_validate_volumes()
805
804
  # Skip validating workdir and file_mounts, as those need to be
806
805
  # validated after the files are uploaded to the SkyPilot API server
807
806
  # with `upload_mounts_to_api_server`.
@@ -1233,7 +1232,8 @@ async def download_logs(
1233
1232
 
1234
1233
 
1235
1234
  @app.post('/download')
1236
- async def download(download_body: payloads.DownloadBody) -> None:
1235
+ async def download(download_body: payloads.DownloadBody,
1236
+ request: fastapi.Request) -> None:
1237
1237
  """Downloads a folder from the cluster to the local machine."""
1238
1238
  folder_paths = [
1239
1239
  pathlib.Path(folder_path) for folder_path in download_body.folder_paths
@@ -1262,7 +1262,16 @@ async def download(download_body: payloads.DownloadBody) -> None:
1262
1262
  str(folder_path.expanduser().resolve())
1263
1263
  for folder_path in folder_paths
1264
1264
  ]
1265
- storage_utils.zip_files_and_folders(folders, zip_path)
1265
+ # Check for optional query parameter to control zip entry structure
1266
+ relative = request.query_params.get('relative', 'home')
1267
+ if relative == 'items':
1268
+ # Dashboard-friendly: entries relative to selected folders
1269
+ storage_utils.zip_files_and_folders(folders,
1270
+ zip_path,
1271
+ relative_to_items=True)
1272
+ else:
1273
+ # CLI-friendly (default): entries with full paths for mapping
1274
+ storage_utils.zip_files_and_folders(folders, zip_path)
1266
1275
 
1267
1276
  # Add home path to the response headers, so that the client can replace
1268
1277
  # the remote path in the zip file to the local path.
@@ -1284,6 +1293,46 @@ async def download(download_body: payloads.DownloadBody) -> None:
1284
1293
  detail=f'Error creating zip file: {str(e)}')
1285
1294
 
1286
1295
 
1296
+ @app.post('/provision_logs')
1297
+ async def provision_logs(cluster_body: payloads.ClusterNameBody,
1298
+ follow: bool = True,
1299
+ tail: int = 0) -> fastapi.responses.StreamingResponse:
1300
+ """Streams the provision.log for the latest launch request of a cluster."""
1301
+ # Prefer clusters table first, then cluster_history as fallback.
1302
+ log_path_str = global_user_state.get_cluster_provision_log_path(
1303
+ cluster_body.cluster_name)
1304
+ if not log_path_str:
1305
+ log_path_str = global_user_state.get_cluster_history_provision_log_path(
1306
+ cluster_body.cluster_name)
1307
+ if not log_path_str:
1308
+ raise fastapi.HTTPException(
1309
+ status_code=404,
1310
+ detail=('Provision log path is not recorded for this cluster. '
1311
+ 'Please relaunch to generate provisioning logs.'))
1312
+
1313
+ log_path = pathlib.Path(log_path_str).expanduser().resolve()
1314
+ if not log_path.exists():
1315
+ raise fastapi.HTTPException(
1316
+ status_code=404,
1317
+ detail=f'Provision log path does not exist: {str(log_path)}')
1318
+
1319
+ # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
1320
+ effective_tail = None if tail is None or tail <= 0 else tail
1321
+
1322
+ return fastapi.responses.StreamingResponse(
1323
+ content=stream_utils.log_streamer(None,
1324
+ log_path,
1325
+ tail=effective_tail,
1326
+ follow=follow),
1327
+ media_type='text/plain',
1328
+ headers={
1329
+ 'Cache-Control': 'no-cache, no-transform',
1330
+ 'X-Accel-Buffering': 'no',
1331
+ 'Transfer-Encoding': 'chunked',
1332
+ },
1333
+ )
1334
+
1335
+
1287
1336
  @app.post('/cost_report')
1288
1337
  async def cost_report(request: fastapi.Request,
1289
1338
  cost_report_body: payloads.CostReportBody) -> None:
@@ -1541,13 +1590,7 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
1541
1590
  """Checks the health of the API server.
1542
1591
 
1543
1592
  Returns:
1544
- A dictionary with the following keys:
1545
- - status: str; The status of the API server.
1546
- - api_version: str; The API version of the API server.
1547
- - version: str; The version of SkyPilot used for API server.
1548
- - version_on_disk: str; The version of the SkyPilot installation on
1549
- disk, which can be used to warn about restarting the API server
1550
- - commit: str; The commit hash of SkyPilot used for API server.
1593
+ responses.APIHealthResponse: The health response.
1551
1594
  """
1552
1595
  user = request.state.auth_user
1553
1596
  server_status = common.ApiServerStatus.HEALTHY
@@ -1815,6 +1858,9 @@ if __name__ == '__main__':
1815
1858
  global_tasks.append(background.create_task(metrics_server.serve()))
1816
1859
  global_tasks.append(
1817
1860
  background.create_task(requests_lib.requests_gc_daemon()))
1861
+ global_tasks.append(
1862
+ background.create_task(
1863
+ global_user_state.cluster_event_retention_daemon()))
1818
1864
  threading.Thread(target=background.run_forever, daemon=True).start()
1819
1865
 
1820
1866
  queue_server, workers = executor.start(config)
@@ -378,6 +378,7 @@ available_node_types:
378
378
  {% if volume_mounts %}
379
379
  securityContext:
380
380
  fsGroup: 1000
381
+ fsGroupChangePolicy: OnRootMismatch
381
382
  {% endif %}
382
383
 
383
384
  # Add node selector if GPU/TPUs are requested:
@@ -6,6 +6,7 @@ import click
6
6
  import colorama
7
7
 
8
8
  from sky import backends
9
+ from sky.schemas.api import responses
9
10
  from sky.utils import common_utils
10
11
  from sky.utils import log_utils
11
12
  from sky.utils import resources_utils
@@ -44,7 +45,7 @@ class StatusColumn:
44
45
  return val
45
46
 
46
47
 
47
- def show_status_table(cluster_records: List[_ClusterRecord],
48
+ def show_status_table(cluster_records: List[responses.StatusResponse],
48
49
  show_all: bool,
49
50
  show_user: bool,
50
51
  query_clusters: Optional[List[str]] = None,
sky/utils/common_utils.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Utils shared between all of sky"""
2
2
 
3
3
  import difflib
4
+ import enum
4
5
  import functools
5
6
  import getpass
6
7
  import hashlib
@@ -55,6 +56,25 @@ _VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
55
56
  logger = sky_logging.init_logger(__name__)
56
57
 
57
58
 
59
+ class ProcessStatus(enum.Enum):
60
+ """Process status."""
61
+
62
+ # The process is scheduled to run, but not started yet.
63
+ SCHEDULED = 'SCHEDULED'
64
+
65
+ # The process is running
66
+ RUNNING = 'RUNNING'
67
+
68
+ # The process is finished and succeeded
69
+ SUCCEEDED = 'SUCCEEDED'
70
+
71
+ # The process is interrupted
72
+ INTERRUPTED = 'INTERRUPTED'
73
+
74
+ # The process failed
75
+ FAILED = 'FAILED'
76
+
77
+
58
78
  @annotations.lru_cache(scope='request')
59
79
  def get_usage_run_id() -> str:
60
80
  """Returns a unique run id for each 'run'.
@@ -1224,13 +1224,26 @@ def _get_launch_parallelism() -> int:
1224
1224
 
1225
1225
 
1226
1226
  def can_provision() -> bool:
1227
- num_provision = (
1228
- serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
1229
- managed_job_state.get_num_launching_jobs())
1230
- return num_provision < _get_launch_parallelism()
1227
+ # We always prioritize terminating over provisioning, to save the cost on
1228
+ # idle resources.
1229
+ if serve_state.total_number_scheduled_to_terminate_replicas() > 0:
1230
+ return False
1231
+ return can_terminate()
1231
1232
 
1232
1233
 
1233
1234
  def can_start_new_process() -> bool:
1234
1235
  num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
1235
1236
  managed_job_state.get_num_alive_jobs())
1236
1237
  return num_procs < _get_job_parallelism()
1238
+
1239
+
1240
+ # We limit the number of terminating replicas to the number of CPUs. This is
1241
+ # just a temporary solution to avoid overwhelming the controller. After one job
1242
+ # controller PR, we should use API server to handle resources management.
1243
+ def can_terminate() -> bool:
1244
+ num_terminating = (
1245
+ serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
1246
+ # Each terminate process will take roughly the same CPUs as job launch.
1247
+ serve_state.total_number_terminating_replicas() +
1248
+ managed_job_state.get_num_launching_jobs())
1249
+ return num_terminating < _get_launch_parallelism()
@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
19
19
  DB_INIT_LOCK_TIMEOUT_SECONDS = 10
20
20
 
21
21
  GLOBAL_USER_STATE_DB_NAME = 'state_db'
22
- GLOBAL_USER_STATE_VERSION = '005'
22
+ GLOBAL_USER_STATE_VERSION = '006'
23
23
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
24
24
 
25
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
sky/utils/log_utils.py CHANGED
@@ -47,13 +47,16 @@ class RayUpLineProcessor(LineProcessor):
47
47
  RUNTIME_SETUP = 1
48
48
  PULLING_DOCKER_IMAGES = 2
49
49
 
50
- def __init__(self, log_path: str):
50
+ def __init__(self, log_path: str, cluster_name: Optional[str] = None):
51
51
  self.log_path = log_path
52
+ self.cluster_name = cluster_name
52
53
 
53
54
  def __enter__(self) -> None:
54
55
  self.state = self.ProvisionStatus.LAUNCH
55
56
  self.status_display = rich_utils.safe_status(
56
- ux_utils.spinner_message('Launching', self.log_path))
57
+ ux_utils.spinner_message('Launching',
58
+ self.log_path,
59
+ cluster_name=self.cluster_name))
57
60
  self.status_display.start()
58
61
 
59
62
  def process_line(self, log_line: str) -> None:
@@ -62,19 +65,25 @@ class RayUpLineProcessor(LineProcessor):
62
65
  logger.info(' Head VM is up.')
63
66
  self.status_display.update(
64
67
  ux_utils.spinner_message(
65
- 'Launching - Preparing SkyPilot runtime', self.log_path))
68
+ 'Launching - Preparing SkyPilot runtime',
69
+ self.log_path,
70
+ cluster_name=self.cluster_name))
66
71
  self.state = self.ProvisionStatus.RUNTIME_SETUP
67
72
  if ('Pulling from' in log_line and
68
73
  self.state == self.ProvisionStatus.RUNTIME_SETUP):
69
74
  self.status_display.update(
70
75
  ux_utils.spinner_message(
71
- 'Launching - Initializing docker container', self.log_path))
76
+ 'Launching - Initializing docker container',
77
+ self.log_path,
78
+ cluster_name=self.cluster_name))
72
79
  self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
73
80
  if ('Status: Downloaded newer image' in log_line and
74
81
  self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
75
82
  self.status_display.update(
76
83
  ux_utils.spinner_message(
77
- 'Launching - Preparing SkyPilot runtime', self.log_path))
84
+ 'Launching - Preparing SkyPilot runtime',
85
+ self.log_path,
86
+ cluster_name=self.cluster_name))
78
87
  self.state = self.ProvisionStatus.RUNTIME_SETUP
79
88
 
80
89
  def __exit__(self, except_type: Optional[Type[BaseException]],
@@ -5,7 +5,7 @@ import itertools
5
5
  import json
6
6
  import math
7
7
  import typing
8
- from typing import Dict, List, Optional, Set, Union
8
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
9
9
 
10
10
  from sky import skypilot_config
11
11
  from sky.skylet import constants
@@ -435,3 +435,27 @@ def parse_time_minutes(time: str) -> int:
435
435
  continue
436
436
 
437
437
  raise ValueError(f'Invalid time format: {time}')
438
+
439
+
440
+ def normalize_any_of_resources_config(
441
+ any_of: List[Dict[str, Any]]) -> Tuple[str, ...]:
442
+ """Normalize a list of any_of resources config to a canonical form.
443
+
444
+ Args:
445
+ any_of: A list of any_of resources config.
446
+
447
+ Returns:
448
+ A normalized tuple representation that can be compared for equality.
449
+ Two lists with the same resource configurations in different orders
450
+ will produce the same normalized result.
451
+ """
452
+ if not any_of:
453
+ return tuple()
454
+
455
+ # Convert each config to JSON string with sorted keys, then sort the list
456
+ normalized_configs = [
457
+ json.dumps(config, sort_keys=True, separators=(',', ':'))
458
+ for config in any_of
459
+ ]
460
+
461
+ return tuple(sorted(normalized_configs))
sky/utils/schemas.py CHANGED
@@ -1535,6 +1535,9 @@ def get_config_schema():
1535
1535
  'requests_retention_hours': {
1536
1536
  'type': 'integer',
1537
1537
  },
1538
+ 'cluster_event_retention_hours': {
1539
+ 'type': 'number',
1540
+ },
1538
1541
  }
1539
1542
  }
1540
1543
 
sky/utils/ux_utils.py CHANGED
@@ -26,9 +26,16 @@ BOLD = '\033[1m'
26
26
  RESET_BOLD = '\033[0m'
27
27
 
28
28
  # Log path hint in the spinner during launching
29
+ # (old, kept for backward compatibility)
29
30
  _LOG_PATH_HINT = (f'{colorama.Style.DIM}View logs: sky api logs -l '
30
31
  '{log_path}'
31
32
  f'{colorama.Style.RESET_ALL}')
33
+ # Log hint: recommend sky logs --provision <cluster_name>
34
+ _PROVISION_LOG_HINT = (
35
+ f'{colorama.Style.DIM}View logs: '
36
+ f'{BOLD}sky logs --provision {{cluster_name}}{RESET_BOLD}'
37
+ f'{colorama.Style.RESET_ALL}')
38
+ # Legacy path hint retained for local-only cases where we don't have cluster
32
39
  _LOG_PATH_HINT_LOCAL = (f'{colorama.Style.DIM}View logs: '
33
40
  '{log_path}'
34
41
  f'{colorama.Style.RESET_ALL}')
@@ -126,7 +133,10 @@ class RedirectOutputForProcess:
126
133
 
127
134
  def log_path_hint(log_path: Union[str, 'pathlib.Path'],
128
135
  is_local: bool = False) -> str:
129
- """Gets the log path hint for the given log path."""
136
+ """Gets the log path hint for the given log path.
137
+
138
+ Kept for backward compatibility when only paths are available.
139
+ """
130
140
  log_path = str(log_path)
131
141
  expanded_home = os.path.expanduser('~')
132
142
  if log_path.startswith(expanded_home):
@@ -139,6 +149,12 @@ def log_path_hint(log_path: Union[str, 'pathlib.Path'],
139
149
  return _LOG_PATH_HINT.format(log_path=log_path)
140
150
 
141
151
 
152
+ def provision_hint(cluster_name: Optional[str]) -> Optional[str]:
153
+ if not cluster_name:
154
+ return None
155
+ return _PROVISION_LOG_HINT.format(cluster_name=cluster_name)
156
+
157
+
142
158
  def starting_message(message: str) -> str:
143
159
  """Gets the starting message for the given message."""
144
160
  # We have to reset the color before the message, because sometimes if a
@@ -150,7 +166,8 @@ def starting_message(message: str) -> str:
150
166
  def finishing_message(message: str,
151
167
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
152
168
  is_local: bool = False,
153
- follow_up_message: Optional[str] = None) -> str:
169
+ follow_up_message: Optional[str] = None,
170
+ cluster_name: Optional[str] = None) -> str:
154
171
  """Gets the finishing message for the given message.
155
172
 
156
173
  Args:
@@ -168,6 +185,9 @@ def finishing_message(message: str,
168
185
  success_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
169
186
  f'{message}{colorama.Style.RESET_ALL}{follow_up_message}'
170
187
  f'{colorama.Style.RESET_ALL}')
188
+ hint = provision_hint(cluster_name)
189
+ if hint:
190
+ return f'{success_prefix} {hint}'
171
191
  if log_path is None:
172
192
  return success_prefix
173
193
  path_hint = log_path_hint(log_path, is_local)
@@ -176,13 +196,17 @@ def finishing_message(message: str,
176
196
 
177
197
  def error_message(message: str,
178
198
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
179
- is_local: bool = False) -> str:
199
+ is_local: bool = False,
200
+ cluster_name: Optional[str] = None) -> str:
180
201
  """Gets the error message for the given message."""
181
202
  # We have to reset the color before the message, because sometimes if a
182
203
  # previous spinner with dimmed color overflows in a narrow terminal, the
183
204
  # color might be messed up.
184
205
  error_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
185
206
  f'{colorama.Style.RESET_ALL} {message}')
207
+ hint = provision_hint(cluster_name)
208
+ if hint:
209
+ return f'{error_prefix} {hint}'
186
210
  if log_path is None:
187
211
  return error_prefix
188
212
  path_hint = log_path_hint(log_path, is_local)
@@ -200,9 +224,16 @@ def retry_message(message: str) -> str:
200
224
 
201
225
  def spinner_message(message: str,
202
226
  log_path: Optional[Union[str, 'pathlib.Path']] = None,
203
- is_local: bool = False) -> str:
204
- """Gets the spinner message for the given message and log path."""
227
+ is_local: bool = False,
228
+ cluster_name: Optional[str] = None) -> str:
229
+ """Gets the spinner message for the given message and log path.
230
+
231
+ If cluster_name is provided, recommend `sky logs --provision <cluster>`.
232
+ """
205
233
  colored_spinner = f'[bold cyan]{message}[/]'
234
+ hint = provision_hint(cluster_name)
235
+ if hint:
236
+ return f'{colored_spinner} {hint}'
206
237
  if log_path is None:
207
238
  return colored_spinner
208
239
  path_hint = log_path_hint(log_path, is_local)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250814
3
+ Version: 1.0.0.dev20250816
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0