skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (151) hide show
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/catalog/kubernetes_catalog.py +8 -0
  4. sky/catalog/nebius_catalog.py +0 -1
  5. sky/check.py +11 -1
  6. sky/client/cli/command.py +234 -100
  7. sky/client/sdk.py +30 -9
  8. sky/client/sdk_async.py +815 -0
  9. sky/clouds/kubernetes.py +6 -1
  10. sky/clouds/nebius.py +1 -4
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3698-7874720877646365.js → 3850-ff4a9a69d978632b.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  26. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9025.7937c16bc8623516.js → 9025.a1bef12d672bb66d.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  30. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-1e6de35d15a8d432.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{config-8620d099cbef8608.js → config-dfb9bf07b13045f4.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  49. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  50. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  51. sky/dashboard/out/clusters/[cluster].html +1 -1
  52. sky/dashboard/out/clusters.html +1 -1
  53. sky/dashboard/out/config.html +1 -1
  54. sky/dashboard/out/index.html +1 -1
  55. sky/dashboard/out/infra/[context].html +1 -1
  56. sky/dashboard/out/infra.html +1 -1
  57. sky/dashboard/out/jobs/[job].html +1 -1
  58. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -1
  61. sky/dashboard/out/volumes.html +1 -1
  62. sky/dashboard/out/workspace/new.html +1 -1
  63. sky/dashboard/out/workspaces/[name].html +1 -1
  64. sky/dashboard/out/workspaces.html +1 -1
  65. sky/global_user_state.py +14 -2
  66. sky/jobs/__init__.py +2 -0
  67. sky/jobs/client/sdk.py +43 -2
  68. sky/jobs/client/sdk_async.py +135 -0
  69. sky/jobs/server/core.py +48 -1
  70. sky/jobs/server/server.py +52 -3
  71. sky/jobs/state.py +5 -1
  72. sky/jobs/utils.py +3 -1
  73. sky/provision/kubernetes/utils.py +30 -4
  74. sky/provision/nebius/instance.py +1 -0
  75. sky/provision/nebius/utils.py +9 -1
  76. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  77. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  78. sky/serve/client/impl.py +85 -1
  79. sky/serve/client/sdk.py +16 -47
  80. sky/serve/client/sdk_async.py +130 -0
  81. sky/serve/constants.py +3 -1
  82. sky/serve/controller.py +6 -3
  83. sky/serve/load_balancer.py +3 -1
  84. sky/serve/serve_state.py +93 -5
  85. sky/serve/serve_utils.py +200 -67
  86. sky/serve/server/core.py +13 -197
  87. sky/serve/server/impl.py +261 -23
  88. sky/serve/service.py +15 -3
  89. sky/server/auth/__init__.py +0 -0
  90. sky/server/auth/authn.py +46 -0
  91. sky/server/auth/oauth2_proxy.py +185 -0
  92. sky/server/common.py +119 -21
  93. sky/server/constants.py +1 -1
  94. sky/server/daemons.py +60 -11
  95. sky/server/requests/executor.py +5 -3
  96. sky/server/requests/payloads.py +19 -0
  97. sky/server/rest.py +114 -0
  98. sky/server/server.py +44 -40
  99. sky/setup_files/dependencies.py +2 -0
  100. sky/skylet/constants.py +1 -1
  101. sky/skylet/events.py +5 -1
  102. sky/skylet/skylet.py +3 -1
  103. sky/task.py +61 -21
  104. sky/templates/kubernetes-ray.yml.j2 +9 -0
  105. sky/templates/nebius-ray.yml.j2 +1 -0
  106. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  107. sky/usage/usage_lib.py +8 -6
  108. sky/utils/annotations.py +8 -3
  109. sky/utils/common_utils.py +11 -1
  110. sky/utils/controller_utils.py +7 -0
  111. sky/utils/db/migration_utils.py +2 -2
  112. sky/utils/rich_utils.py +120 -0
  113. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +22 -13
  114. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +120 -112
  115. sky/client/sdk.pyi +0 -300
  116. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  119. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  126. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  138. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  145. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  146. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  147. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-85426374db04811e.js} +0 -0
  148. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  149. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  150. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  151. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
@@ -707,6 +707,25 @@ class JobsPoolStatusBody(RequestBody):
707
707
  pool_names: Optional[Union[str, List[str]]]
708
708
 
709
709
 
710
class JobsPoolLogsBody(RequestBody):
    """The request body for the jobs pool logs endpoint."""
    # Name of the pool the request refers to.
    pool_name: str
    # Either a plain string or a serve.ServiceComponent member; presumably
    # selects which component's logs to read -- confirm against the handler.
    target: Union[str, serve.ServiceComponent]
    # Optional worker identifier; None by default. NOTE(review): looks like
    # None means "no specific worker" -- confirm against the handler.
    worker_id: Optional[int] = None
    # Defaults to True; presumably keeps streaming logs as they are produced.
    follow: bool = True
    # Optional integer, None by default; presumably limits output to the last
    # N lines -- TODO confirm.
    tail: Optional[int] = None
717
+
718
+
719
class JobsPoolDownloadLogsBody(RequestBody):
    """The request body for the jobs pool download logs endpoint."""
    # Name of the pool the request refers to.
    pool_name: str
    # Directory path supplied by the caller; presumably where downloaded logs
    # are written -- confirm against the handler.
    local_dir: str
    # A single target (string or serve.ServiceComponent), a list of them, or
    # None. This field has no default, so it must be supplied explicitly.
    targets: Optional[Union[str, serve.ServiceComponent,
                            List[Union[str, serve.ServiceComponent]]]]
    # Optional list of worker identifiers; None by default.
    worker_ids: Optional[List[int]] = None
    # Optional integer, None by default; presumably limits output to the last
    # N lines -- TODO confirm.
    tail: Optional[int] = None
727
+
728
+
710
729
  class UploadZipFileResponse(pydantic.BaseModel):
711
730
  """The response body for the upload zip file endpoint."""
712
731
  status: str
sky/server/rest.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """REST API client of SkyPilot API server"""
2
2
 
3
+ import asyncio
3
4
  import contextlib
4
5
  import contextvars
5
6
  import functools
@@ -21,9 +22,11 @@ from sky.utils import ux_utils
21
22
  logger = sky_logging.init_logger(__name__)
22
23
 
23
24
  if typing.TYPE_CHECKING:
25
+ import aiohttp
24
26
  import requests
25
27
 
26
28
  else:
29
+ aiohttp = adaptors_common.LazyImport('aiohttp')
27
30
  requests = adaptors_common.LazyImport('requests')
28
31
 
29
32
  F = TypeVar('F', bound=Callable[..., Any])
@@ -204,3 +207,114 @@ def request_without_retry(method, url, **kwargs) -> 'requests.Response':
204
207
  if remote_version is not None:
205
208
  versions.set_remote_version(remote_version)
206
209
  return response
210
+
211
+
212
+ # Async versions of the above functions
213
+
214
+
215
async def request_async(session: 'aiohttp.ClientSession', method: str, url: str,
                        **kwargs) -> 'aiohttp.ClientResponse':
    """Send an async request to the API server, retrying transient failures.

    At most three attempts are made. An interrupted request
    (``exceptions.RequestInterruptedError``) is retried immediately; any
    other error that ``_is_transient_error_async`` classifies as transient is
    retried after a backoff delay. Non-transient errors are re-raised at once.

    Args:
        session: The aiohttp client session used to issue the request.
        method: The HTTP method, forwarded to the underlying request call.
        url: The URL to request.
        **kwargs: Forwarded unchanged to ``request_without_retry_async``.

    Returns:
        The response from the first attempt that succeeds.

    Raises:
        exceptions.RequestInterruptedError: If the final attempt was
            interrupted.
        Exception: The error raised by the final attempt, or the first
            non-transient error encountered.
    """
    max_retries = 3
    initial_backoff = 1.0
    max_backoff_factor = 5

    backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
    # Placeholder only; every failed attempt below overwrites it.
    last_exception = Exception('Unknown Exception')

    for retry_count in range(max_retries):
        try:
            return await request_without_retry_async(session, method, url,
                                                     **kwargs)
        except exceptions.RequestInterruptedError as e:
            # Record the interruption: if it happens on the last attempt the
            # loop simply ends, and the statement after the loop must raise
            # the real cause rather than a stale or placeholder exception.
            last_exception = e
            logger.debug('Request interrupted. Retry immediately.')
            continue
        except Exception as e:  # pylint: disable=broad-except
            last_exception = e
            if retry_count >= max_retries - 1:
                # Retries exhausted
                raise

            # Check if this is a transient error (similar to sync version
            # logic)
            if not _is_transient_error_async(e):
                # Permanent error, no need to retry
                raise

            logger.debug(f'Retry async request due to {e}, '
                         f'attempt {retry_count + 1}/{max_retries}')
            await asyncio.sleep(backoff.current_backoff())

    # Reached only when the final attempt was interrupted.
    raise last_exception
251
+
252
+
253
async def request_without_retry_async(session: 'aiohttp.ClientSession',
                                      method: str, url: str,
                                      **kwargs) -> 'aiohttp.ClientResponse':
    """Send a single async request to the API server, without retrying.

    The API version and SkyPilot version headers are added to the outgoing
    request, and the versions advertised in the response headers are recorded
    through ``versions``.

    Args:
        session: The aiohttp client session used to issue the request.
        method: The HTTP method passed to ``session.request``.
        url: The URL to request.
        **kwargs: Extra arguments for ``session.request``. A ``headers``
            mapping, if given, is updated in place with the version headers.

    Returns:
        The response object returned by ``session.request``.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: The server answered
            with HTTP 503.
        exceptions.RequestInterruptedError: The connection could not be
            established or the request timed out.
        aiohttp.ClientError: Any other client error, propagated unchanged.
    """
    # Add API version headers for compatibility (like sync version does).
    # An explicit ``headers=None`` is treated the same as no headers at all.
    if kwargs.get('headers') is None:
        kwargs['headers'] = {}
    kwargs['headers'][constants.API_VERSION_HEADER] = str(constants.API_VERSION)
    kwargs['headers'][constants.VERSION_HEADER] = (
        versions.get_local_readable_version())

    try:
        response = await session.request(method, url, **kwargs)
    except asyncio.TimeoutError as e:
        # aiohttp reports timeouts as asyncio.TimeoutError;
        # aiohttp.ClientTimeout is only the timeout *configuration* object
        # and is never raised, so it cannot be used to detect a timeout.
        raise exceptions.RequestInterruptedError(
            f'Request timeout: {e}') from e
    except aiohttp.ClientConnectorError as e:
        raise exceptions.RequestInterruptedError(
            f'Connection failed: {e}') from e
    # Any other aiohttp.ClientError propagates to the caller unchanged.

    # Handle server unavailability (503 status) - same as sync version
    if response.status == 503:
        # The response is discarded on this path, so hand its connection back
        # to the session instead of leaving it checked out.
        response.release()
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ServerTemporarilyUnavailableError(
                'SkyPilot API server is temporarily unavailable. '
                'Please try again later.')

    # Set remote API version and version from headers - same as sync version
    remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
    remote_version = response.headers.get(constants.VERSION_HEADER)
    if remote_api_version is not None:
        versions.set_remote_api_version(int(remote_api_version))
    if remote_version is not None:
        versions.set_remote_version(remote_version)

    return response
294
+
295
+
296
def _is_transient_error_async(e: Exception) -> bool:
    """Decide whether a failed async request is worth retrying.

    Mirrors the logic from the sync version's is_transient_error().

    Args:
        e: The exception raised by the request attempt.

    Returns:
        False only for an ``aiohttp.ClientResponseError`` whose status is
        below 500; True for every other exception.
    """
    # A response error carries an HTTP status: only server errors (5xx) are
    # considered transient (same as sync version).
    if isinstance(e, aiohttp.ClientResponseError):
        return e.status >= 500

    # Connection failures, timeouts and ServerTemporarilyUnavailableError are
    # transient. It is hard to enumerate all other errors that are transient,
    # e.g. broken pipe, connection refused, etc. Instead, it is safer to
    # assume all other errors might be transient since we only retry for 3
    # times by default. (Same comment as in sync version)
    return True
sky/server/server.py CHANGED
@@ -51,6 +51,8 @@ from sky.server import metrics
51
51
  from sky.server import state
52
52
  from sky.server import stream_utils
53
53
  from sky.server import versions
54
+ from sky.server.auth import authn
55
+ from sky.server.auth import oauth2_proxy
54
56
  from sky.server.requests import executor
55
57
  from sky.server.requests import payloads
56
58
  from sky.server.requests import preconditions
@@ -120,41 +122,6 @@ def _basic_auth_401_response(content: str):
120
122
  content=content)
121
123
 
122
124
 
123
- # TODO(hailong): Remove this function and use request.state.auth_user instead.
124
- async def _override_user_info_in_request_body(request: fastapi.Request,
125
- auth_user: Optional[models.User]):
126
- if auth_user is None:
127
- return
128
-
129
- body = await request.body()
130
- if body:
131
- try:
132
- original_json = await request.json()
133
- except (json.JSONDecodeError, UnicodeDecodeError) as e:
134
- logger.error(f'Error parsing request JSON: {e}')
135
- else:
136
- logger.debug(f'Overriding user for {request.state.request_id}: '
137
- f'{auth_user.name}, {auth_user.id}')
138
- if 'env_vars' in original_json:
139
- if isinstance(original_json.get('env_vars'), dict):
140
- original_json['env_vars'][
141
- constants.USER_ID_ENV_VAR] = auth_user.id
142
- original_json['env_vars'][
143
- constants.USER_ENV_VAR] = auth_user.name
144
- else:
145
- logger.warning(
146
- f'"env_vars" in request body is not a dictionary '
147
- f'for request {request.state.request_id}. '
148
- 'Skipping user info injection into body.')
149
- else:
150
- original_json['env_vars'] = {}
151
- original_json['env_vars'][
152
- constants.USER_ID_ENV_VAR] = auth_user.id
153
- original_json['env_vars'][
154
- constants.USER_ENV_VAR] = auth_user.name
155
- request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
156
-
157
-
158
125
  def _try_set_basic_auth_user(request: fastapi.Request):
159
126
  auth_header = request.headers.get('authorization')
160
127
  if not auth_header or not auth_header.lower().startswith('basic '):
@@ -281,7 +248,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
281
248
  apr_md5_crypt.verify(password, user.password)):
282
249
  valid_user = True
283
250
  request.state.auth_user = user
284
- await _override_user_info_in_request_body(request, user)
251
+ await authn.override_user_info_in_request_body(request, user)
285
252
  break
286
253
  if not valid_user:
287
254
  return _basic_auth_401_response('Invalid credentials')
@@ -400,7 +367,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
400
367
  request.state.auth_user = auth_user
401
368
 
402
369
  # Override user info in request body for service account requests
403
- await _override_user_info_in_request_body(request, auth_user)
370
+ await authn.override_user_info_in_request_body(request, auth_user)
404
371
 
405
372
  logger.debug(f'Authenticated service account: {user_id}')
406
373
 
@@ -445,7 +412,7 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
445
412
  if auth_user is not None:
446
413
  request.state.auth_user = auth_user
447
414
 
448
- await _override_user_info_in_request_body(request, auth_user)
415
+ await authn.override_user_info_in_request_body(request, auth_user)
449
416
  return await call_next(request)
450
417
 
451
418
 
@@ -484,6 +451,8 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
484
451
  del app # unused
485
452
  # Startup: Run background tasks
486
453
  for event in daemons.INTERNAL_REQUEST_DAEMONS:
454
+ if event.should_skip():
455
+ continue
487
456
  try:
488
457
  executor.schedule_request(
489
458
  request_id=event.id,
@@ -625,6 +594,8 @@ app.add_middleware(
625
594
  # RBACMiddleware must precede all the auth middleware, so it can access
626
595
  # request.state.auth_user.
627
596
  app.add_middleware(RBACMiddleware)
597
+ # Authentication based on oauth2-proxy.
598
+ app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
628
599
  # AuthProxyMiddleware should precede BasicAuthMiddleware and
629
600
  # BearerTokenMiddleware, since it should be skipped if either of those set the
630
601
  # auth user.
@@ -1574,9 +1545,42 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
1574
1545
  - commit: str; The commit hash of SkyPilot used for API server.
1575
1546
  """
1576
1547
  user = request.state.auth_user
1577
- logger.info(f'Health endpoint: request.state.auth_user = {user}')
1548
+ server_status = common.ApiServerStatus.HEALTHY
1549
+ if getattr(request.state, 'anonymous_user', False):
1550
+ # API server authentication is enabled, but the request is not
1551
+ # authenticated. We still have to serve the request because the
1552
+ # /api/health endpoint has two different usages:
1553
+ # 1. For health check from `api start` and external orchestration
1554
+ # tools (k8s), which does not require authentication and user info.
1555
+ # 2. Return server info to client and hint client to login if required.
1556
+ # Separating these two usages to different APIs will break backward
1557
+ # compatibility for existing orchestration solutions (e.g. helm chart).
1558
+ # So we serve these two usages in a backward compatible manner below.
1559
+ client_version = versions.get_remote_api_version()
1560
+ # - For Client with API version >= 14, we return 200 response with
1561
+ # status=NEEDS_AUTH, new client will handle the login process.
1562
+ # - For health check from `sky api start`, the client code always uses
1563
+ # the same API version with the server, thus there is no compatibility
1564
+ # issue.
1565
+ server_status = common.ApiServerStatus.NEEDS_AUTH
1566
+ if client_version is None:
1567
+ # - For health check from orchestration tools (e.g. k8s), we also
1568
+ # return 200 with status=NEEDS_AUTH, which passes HTTP probe
1569
+ # check.
1570
+ # - There is no harm when a malicious client calls /api/health
1571
+ # without authentication since no sensitive information is
1572
+ # returned.
1573
+ return {'status': common.ApiServerStatus.HEALTHY}
1574
+ # TODO(aylei): remove this after min_compatible_api_version >= 14.
1575
+ if client_version < 14:
1576
+ # For Client with API version < 14, the NEEDS_AUTH status is not
1577
+ # honored. Return 401 to trigger the login process.
1578
+ raise fastapi.HTTPException(status_code=401,
1579
+ detail='Authentication required')
1580
+
1581
+ logger.debug(f'Health endpoint: request.state.auth_user = {user}')
1578
1582
  return {
1579
- 'status': common.ApiServerStatus.HEALTHY.value,
1583
+ 'status': server_status,
1580
1584
  # Kept for backward compatibility, clients before 0.11.0 will read this
1581
1585
  # field to check compatibility and hint the user to upgrade the CLI.
1582
1586
  # TODO(aylei): remove this field after 0.13.0
@@ -69,6 +69,7 @@ install_requires = [
69
69
  'gitpython',
70
70
  'types-paramiko',
71
71
  'alembic',
72
+ 'aiohttp',
72
73
  ]
73
74
 
74
75
  server_dependencies = [
@@ -76,6 +77,7 @@ server_dependencies = [
76
77
  'sqlalchemy_adapter',
77
78
  'passlib',
78
79
  'pyjwt',
80
+ 'aiohttp',
79
81
  ]
80
82
 
81
83
  local_ray = [
sky/skylet/constants.py CHANGED
@@ -406,7 +406,7 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
406
406
  PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
407
407
  '~/.sky/.controller_recovery_restarting_signal')
408
408
 
409
- HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
409
+ HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/{}ha_recovery.log'
410
410
 
411
411
  # The placeholder for the local skypilot config path in file mounts for
412
412
  # controllers.
sky/skylet/events.py CHANGED
@@ -96,8 +96,12 @@ class ServiceUpdateEvent(SkyletEvent):
96
96
  """
97
97
  EVENT_INTERVAL_SECONDS = 300
98
98
 
99
+ def __init__(self, pool: bool) -> None:
100
+ super().__init__()
101
+ self._pool = pool
102
+
99
103
  def _run(self):
100
- serve_utils.update_service_status()
104
+ serve_utils.update_service_status(self._pool)
101
105
 
102
106
 
103
107
  class UsageHeartbeatReportEvent(SkyletEvent):
sky/skylet/skylet.py CHANGED
@@ -24,7 +24,9 @@ EVENTS = [
24
24
  # This is for monitoring controller job status. If it becomes
25
25
  # unhealthy, this event will correctly update the controller
26
26
  # status to CONTROLLER_FAILED.
27
- events.ServiceUpdateEvent(),
27
+ events.ServiceUpdateEvent(pool=False),
28
+ # Status refresh for pool.
29
+ events.ServiceUpdateEvent(pool=True),
28
30
  # Report usage heartbeat every 10 minutes.
29
31
  events.UsageHeartbeatReportEvent(),
30
32
  ]
sky/task.py CHANGED
@@ -241,21 +241,26 @@ class Task:
241
241
  self,
242
242
  name: Optional[str] = None,
243
243
  *,
244
- setup: Optional[str] = None,
245
- run: Optional[CommandOrCommandGen] = None,
244
+ setup: Optional[Union[str, List[str]]] = None,
245
+ run: Optional[Union[CommandOrCommandGen, List[str]]] = None,
246
246
  envs: Optional[Dict[str, str]] = None,
247
247
  secrets: Optional[Dict[str, str]] = None,
248
248
  workdir: Optional[Union[str, Dict[str, Any]]] = None,
249
249
  num_nodes: Optional[int] = None,
250
+ file_mounts: Optional[Dict[str, str]] = None,
251
+ storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
250
252
  volumes: Optional[Dict[str, str]] = None,
253
+ resources: Optional[Union['resources_lib.Resources',
254
+ List['resources_lib.Resources'],
255
+ Set['resources_lib.Resources']]] = None,
251
256
  # Advanced:
252
257
  docker_image: Optional[str] = None,
253
258
  event_callback: Optional[str] = None,
254
259
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
255
260
  # Internal use only.
256
- file_mounts_mapping: Optional[Dict[str, str]] = None,
257
- volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
258
- metadata: Optional[Dict[str, Any]] = None,
261
+ _file_mounts_mapping: Optional[Dict[str, str]] = None,
262
+ _volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
263
+ _metadata: Optional[Dict[str, Any]] = None,
259
264
  _user_specified_yaml: Optional[str] = None,
260
265
  ):
261
266
  """Initializes a Task.
@@ -288,15 +293,15 @@ class Task:
288
293
 
289
294
  Args:
290
295
  name: A string name for the Task for display purposes.
291
- setup: A setup command, which will be run before executing the run
296
+ setup: A setup command(s), which will be run before executing the run
292
297
  commands ``run``, and executed under ``workdir``.
293
298
  run: The actual command for the task. If not None, either a shell
294
- command (str) or a command generator (callable). If latter, it
295
- must take a node rank and a list of node addresses as input and
296
- return a shell command (str) (valid to return None for some nodes,
297
- in which case no commands are run on them). Run commands will be
298
- run under ``workdir``. Note the command generator should be a
299
- self-contained lambda.
299
+ command(s) (str, list(str)) or a command generator (callable). If
300
+ latter, it must take a node rank and a list of node addresses as
301
+ input and return a shell command (str) (valid to return None for
302
+ some nodes, in which case no commands are run on them). Run
303
+ commands will be run under ``workdir``. Note the command generator
304
+ should be a self-contained lambda.
300
305
  envs: A dictionary of environment variables to set before running the
301
306
  setup and run commands.
302
307
  secrets: A dictionary of secret environment variables to set before
@@ -315,22 +320,49 @@ class Task:
315
320
  setup/run command, where ``run`` can either be a str, meaning all
316
321
  nodes get the same command, or a lambda, with the semantics
317
322
  documented above.
323
+ file_mounts: An optional dict of ``{remote_path: (local_path|cloud
324
+ URI)}``, where remote means the VM(s) on which this Task will
325
+ eventually run on, and local means the node from which the task is
326
+ launched.
327
+ storage_mounts: an optional dict of ``{mount_path: sky.Storage
328
+ object}``, where mount_path is the path inside the remote VM(s)
329
+ where the Storage object will be mounted on.
330
+ volumes: A dict of volumes to be mounted for the task. The dict has
331
+ the form of ``{mount_path: volume_name}``.
332
+ resources: either a sky.Resources, a set of them, or a list of them.
333
+ A set or a list of resources asks the optimizer to "pick the
334
+ best of these resources" to run this task.
318
335
  docker_image: (EXPERIMENTAL: Only in effect when LocalDockerBackend
319
336
  is used.) The base docker image that this Task will be built on.
320
337
  Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
338
+ event_callback: A bash script that will be executed when the task
339
+ changes state.
321
340
  blocked_resources: A set of resources that this task cannot run on.
322
- metadata: A dictionary of metadata to be added to the task.
341
+ _file_mounts_mapping: (Internal use only) A dictionary of file mounts
342
+ mapping.
343
+ _volume_mounts: (Internal use only) A list of volume mounts.
344
+ _metadata: (Internal use only) A dictionary of metadata to be added to
345
+ the task.
346
+ _user_specified_yaml: (Internal use only) A string of user-specified
347
+ YAML config.
323
348
  """
324
349
  self.name = name
325
- self.run = run
326
350
  self.storage_mounts: Dict[str, storage_lib.Storage] = {}
327
351
  self.storage_plans: Dict[storage_lib.Storage,
328
352
  storage_lib.StoreType] = {}
329
- self.setup = setup
330
353
  self._envs = envs or {}
331
354
  self._secrets = secrets or {}
332
355
  self._volumes = volumes or {}
333
356
 
357
+ # concatenate commands if given as list
358
+ def _concat(commands):
359
+ if isinstance(commands, list):
360
+ return '\n'.join(commands)
361
+ return commands
362
+
363
+ self.run = _concat(run)
364
+ self.setup = _concat(setup)
365
+
334
366
  # Validate Docker login configuration early if both envs and secrets
335
367
  # contain Docker variables
336
368
  if self._envs or self._secrets:
@@ -372,11 +404,19 @@ class Task:
372
404
  self.best_resources: Optional[sky.Resources] = None
373
405
 
374
406
  # For internal use only.
375
- self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
407
+ self.file_mounts_mapping: Optional[Dict[str,
408
+ str]] = _file_mounts_mapping
376
409
  self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
377
- volume_mounts)
410
+ _volume_mounts)
378
411
 
379
- self._metadata = metadata if metadata is not None else {}
412
+ self._metadata = _metadata if _metadata is not None else {}
413
+
414
+ if resources is not None:
415
+ self.set_resources(resources)
416
+ if storage_mounts is not None:
417
+ self.set_storage_mounts(storage_mounts)
418
+ if file_mounts is not None:
419
+ self.set_file_mounts(file_mounts)
380
420
 
381
421
  dag = sky.dag.get_current_dag()
382
422
  if dag is not None:
@@ -621,10 +661,10 @@ class Task:
621
661
  num_nodes=config.pop('num_nodes', None),
622
662
  envs=config.pop('envs', None),
623
663
  secrets=config.pop('secrets', None),
624
- event_callback=config.pop('event_callback', None),
625
- file_mounts_mapping=config.pop('file_mounts_mapping', None),
626
664
  volumes=config.pop('volumes', None),
627
- metadata=config.pop('_metadata', None),
665
+ event_callback=config.pop('event_callback', None),
666
+ _file_mounts_mapping=config.pop('file_mounts_mapping', None),
667
+ _metadata=config.pop('_metadata', None),
628
668
  _user_specified_yaml=user_specified_yaml,
629
669
  )
630
670
 
@@ -777,6 +777,15 @@ available_node_types:
777
777
  {{ ray_installation_commands }}
778
778
 
779
779
  VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
780
+ # Wait for `patch` package to be installed before applying ray patches
781
+ until dpkg -l | grep -q "^ii patch "; do
782
+ sleep 0.1
783
+ echo "Waiting for patch package to be installed..."
784
+ done
785
+ # Apply Ray patches for progress bar fix
786
+ ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
787
+ VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
788
+ }
780
789
  touch /tmp/ray_skypilot_installation_complete
781
790
  echo "=== Ray and skypilot installation completed ==="
782
791
 
@@ -46,6 +46,7 @@ available_node_types:
46
46
  InstanceType: {{instance_type}}
47
47
  ImageId: {{image_id}}
48
48
  DiskSize: {{disk_size}}
49
+ use_spot: {{ use_spot }}
49
50
  network_tier: {{network_tier}}
50
51
  filesystems:
51
52
  {%- for fs in filesystems %}
@@ -57,6 +57,7 @@ run: |
57
57
  -u -m sky.serve.service \
58
58
  --service-name {{service_name}} \
59
59
  --task-yaml {{remote_task_yaml_path}} \
60
+ --entrypoint {{entrypoint}} \
60
61
  {%- if consolidation_mode_job_id is not none %}
61
62
  --job-id {{consolidation_mode_job_id}} \
62
63
  {%- else %}
sky/usage/usage_lib.py CHANGED
@@ -10,6 +10,8 @@ import traceback
10
10
  import typing
11
11
  from typing import Any, Callable, Dict, List, Optional, Union
12
12
 
13
+ from typing_extensions import ParamSpec
14
+
13
15
  import sky
14
16
  from sky import sky_logging
15
17
  from sky.adaptors import common as adaptors_common
@@ -517,26 +519,26 @@ def entrypoint_context(name: str, fallback: bool = False):
517
519
 
518
520
 
519
521
  T = typing.TypeVar('T')
522
+ P = ParamSpec('P')
520
523
 
521
524
 
522
525
  @typing.overload
523
526
  def entrypoint(
524
527
  name_or_fn: str,
525
- fallback: bool = False
526
- ) -> Callable[[Callable[..., T]], Callable[..., T]]:
528
+ fallback: bool = False) -> Callable[[Callable[P, T]], Callable[P, T]]:
527
529
  ...
528
530
 
529
531
 
530
532
  @typing.overload
531
- def entrypoint(name_or_fn: Callable[..., T],
532
- fallback: bool = False) -> Callable[..., T]:
533
+ def entrypoint(name_or_fn: Callable[P, T],
534
+ fallback: bool = False) -> Callable[P, T]:
533
535
  ...
534
536
 
535
537
 
536
538
  def entrypoint(
537
- name_or_fn: Union[str, Callable[..., T]],
539
+ name_or_fn: Union[str, Callable[P, T]],
538
540
  fallback: bool = False
539
- ) -> Union[Callable[..., T], Callable[[Callable[..., T]], Callable[..., T]]]:
541
+ ) -> Union[Callable[P, T], Callable[[Callable[P, T]], Callable[P, T]]]:
540
542
  return common_utils.make_decorator(entrypoint_context,
541
543
  name_or_fn,
542
544
  fallback=fallback)
sky/utils/annotations.py CHANGED
@@ -1,14 +1,19 @@
1
1
  """Annotations for public APIs."""
2
2
 
3
3
  import functools
4
- from typing import Callable, Literal
4
+ from typing import Callable, Literal, TypeVar
5
+
6
+ from typing_extensions import ParamSpec
5
7
 
6
8
  # Whether the current process is a SkyPilot API server process.
7
9
  is_on_api_server = True
8
10
  FUNCTIONS_NEED_RELOAD_CACHE = []
9
11
 
12
+ T = TypeVar('T')
13
+ P = ParamSpec('P')
14
+
10
15
 
11
- def client_api(func):
16
+ def client_api(func: Callable[P, T]) -> Callable[P, T]:
12
17
  """Mark a function as a client-side API.
13
18
 
14
19
  Code invoked by server-side functions will find annotations.is_on_api_server
@@ -38,7 +43,7 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
38
43
  lru_cache_kwargs: Keyword arguments for functools.lru_cache.
39
44
  """
40
45
 
41
- def decorator(func: Callable) -> Callable:
46
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
42
47
  if scope == 'global':
43
48
  return functools.lru_cache(*lru_cache_args,
44
49
  **lru_cache_kwargs)(func)
sky/utils/common_utils.py CHANGED
@@ -271,12 +271,13 @@ _current_command: Optional[str] = None
271
271
  _current_client_entrypoint: Optional[str] = None
272
272
  _using_remote_api_server: Optional[bool] = None
273
273
  _current_user: Optional['models.User'] = None
274
+ _current_request_id: Optional[str] = None
274
275
 
275
276
 
276
277
  def set_request_context(client_entrypoint: Optional[str],
277
278
  client_command: Optional[str],
278
279
  using_remote_api_server: bool,
279
- user: Optional['models.User']):
280
+ user: Optional['models.User'], request_id: str) -> None:
280
281
  """Override the current client entrypoint and command.
281
282
 
282
283
  This is useful when we are on the SkyPilot API server side and we have a
@@ -286,10 +287,19 @@ def set_request_context(client_entrypoint: Optional[str],
286
287
  global _current_client_entrypoint
287
288
  global _using_remote_api_server
288
289
  global _current_user
290
+ global _current_request_id
289
291
  _current_command = client_command
290
292
  _current_client_entrypoint = client_entrypoint
291
293
  _using_remote_api_server = using_remote_api_server
292
294
  _current_user = user
295
+ _current_request_id = request_id
296
+
297
+
298
+ def get_current_request_id() -> str:
299
+ """Returns the current request id."""
300
+ if _current_request_id is not None:
301
+ return _current_request_id
302
+ return 'dummy-request-id'
293
303
 
294
304
 
295
305
  def get_current_command() -> str:
@@ -209,6 +209,13 @@ class Controllers(enum.Enum):
209
209
  return None
210
210
 
211
211
 
212
+ def get_controller_for_pool(pool: bool) -> Controllers:
213
+ """Get the controller type."""
214
+ if pool:
215
+ return Controllers.JOBS_CONTROLLER
216
+ return Controllers.SKY_SERVE_CONTROLLER
217
+
218
+
212
219
  def high_availability_specified(cluster_name: Optional[str]) -> bool:
213
220
  """Check if the controller high availability is specified in user config.
214
221
  """
@@ -19,11 +19,11 @@ logger = sky_logging.init_logger(__name__)
19
19
  DB_INIT_LOCK_TIMEOUT_SECONDS = 10
20
20
 
21
21
  GLOBAL_USER_STATE_DB_NAME = 'state_db'
22
- GLOBAL_USER_STATE_VERSION = '001'
22
+ GLOBAL_USER_STATE_VERSION = '002'
23
23
  GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
24
24
 
25
25
  SPOT_JOBS_DB_NAME = 'spot_jobs_db'
26
- SPOT_JOBS_VERSION = '002'
26
+ SPOT_JOBS_VERSION = '003'
27
27
  SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
28
28
 
29
29