skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250806__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (103) hide show
  1. sky/__init__.py +2 -2
  2. sky/catalog/kubernetes_catalog.py +8 -0
  3. sky/catalog/nebius_catalog.py +0 -1
  4. sky/client/cli/command.py +26 -7
  5. sky/client/sdk.py +16 -8
  6. sky/client/sdk.pyi +6 -5
  7. sky/client/sdk_async.py +811 -0
  8. sky/clouds/kubernetes.py +6 -1
  9. sky/clouds/nebius.py +1 -4
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-2a43ea3241bbdacd.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa63e8b1d203f298.js → [job]-7cb24da04ca00956.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9e7df5fc761c95a7.js → [cluster]-1e95993124dbfc57.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters-47f1ddae13a2f8e4.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-2a44e70b500b6b70.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/infra-22faac9325016d83.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +11 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/users-b90c865a690bfe84.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/volumes-7af733f5d7b6ed1c.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-4d41c9023287f59a.js → [name]-35e0de5bca55e594.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +1 -0
  32. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  33. sky/dashboard/out/clusters/[cluster].html +1 -1
  34. sky/dashboard/out/clusters.html +1 -1
  35. sky/dashboard/out/config.html +1 -1
  36. sky/dashboard/out/index.html +1 -1
  37. sky/dashboard/out/infra/[context].html +1 -1
  38. sky/dashboard/out/infra.html +1 -1
  39. sky/dashboard/out/jobs/[job].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/jobs/client/sdk_async.py +135 -0
  47. sky/jobs/utils.py +3 -1
  48. sky/provision/kubernetes/utils.py +30 -4
  49. sky/provision/nebius/instance.py +1 -0
  50. sky/provision/nebius/utils.py +9 -1
  51. sky/serve/client/sdk_async.py +130 -0
  52. sky/serve/constants.py +2 -1
  53. sky/serve/controller.py +2 -1
  54. sky/serve/load_balancer.py +3 -1
  55. sky/serve/serve_state.py +70 -5
  56. sky/serve/serve_utils.py +124 -22
  57. sky/serve/server/impl.py +22 -21
  58. sky/serve/service.py +8 -1
  59. sky/server/auth/__init__.py +0 -0
  60. sky/server/auth/authn.py +46 -0
  61. sky/server/auth/oauth2_proxy.py +185 -0
  62. sky/server/common.py +108 -17
  63. sky/server/constants.py +1 -1
  64. sky/server/daemons.py +60 -11
  65. sky/server/rest.py +114 -0
  66. sky/server/server.py +44 -40
  67. sky/setup_files/dependencies.py +2 -0
  68. sky/skylet/constants.py +1 -1
  69. sky/skylet/events.py +5 -1
  70. sky/skylet/skylet.py +3 -1
  71. sky/task.py +43 -10
  72. sky/templates/kubernetes-ray.yml.j2 +4 -0
  73. sky/templates/nebius-ray.yml.j2 +1 -0
  74. sky/utils/controller_utils.py +7 -0
  75. sky/utils/rich_utils.py +120 -0
  76. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/METADATA +5 -1
  77. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/RECORD +86 -81
  78. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  81. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +0 -6
  85. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  90. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  95. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → Gelsd19kVxXcX7aQQGsGu}/_ssgManifest.js +0 -0
  96. /sky/dashboard/out/_next/static/chunks/{1871-7e17c195296e2ea9.js → 1871-ced1c14230cad6e1.js} +0 -0
  97. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-2d7ed3350659d073.js} +0 -0
  98. /sky/dashboard/out/_next/static/chunks/{6601-234b1cf963c7280b.js → 6601-2109d22e7861861c.js} +0 -0
  99. /sky/dashboard/out/_next/static/chunks/{938-40d15b6261ec8dc1.js → 938-bda2685db5eae6cf.js} +0 -0
  100. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/WHEEL +0 -0
  101. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/entry_points.txt +0 -0
  102. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/licenses/LICENSE +0 -0
  103. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/top_level.txt +0 -0
sky/server/rest.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """REST API client of SkyPilot API server"""
2
2
 
3
+ import asyncio
3
4
  import contextlib
4
5
  import contextvars
5
6
  import functools
@@ -21,9 +22,11 @@ from sky.utils import ux_utils
21
22
  logger = sky_logging.init_logger(__name__)
22
23
 
23
24
  if typing.TYPE_CHECKING:
25
+ import aiohttp
24
26
  import requests
25
27
 
26
28
  else:
29
+ aiohttp = adaptors_common.LazyImport('aiohttp')
27
30
  requests = adaptors_common.LazyImport('requests')
28
31
 
29
32
  F = TypeVar('F', bound=Callable[..., Any])
@@ -204,3 +207,114 @@ def request_without_retry(method, url, **kwargs) -> 'requests.Response':
204
207
  if remote_version is not None:
205
208
  versions.set_remote_version(remote_version)
206
209
  return response
210
+
211
+
212
+ # Async versions of the above functions
213
+
214
+
215
+ async def request_async(session: 'aiohttp.ClientSession', method: str, url: str,
216
+ **kwargs) -> 'aiohttp.ClientResponse':
217
+ """Send an async request to the API server, retry on server temporarily
218
+ unavailable."""
219
+ max_retries = 3
220
+ initial_backoff = 1.0
221
+ max_backoff_factor = 5
222
+
223
+ backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
224
+ last_exception = Exception('Uknown Exception') # this will be replaced by e
225
+
226
+ for retry_count in range(max_retries):
227
+ try:
228
+ return await request_without_retry_async(session, method, url,
229
+ **kwargs)
230
+ except exceptions.RequestInterruptedError:
231
+ logger.debug('Request interrupted. Retry immediately.')
232
+ continue
233
+ except Exception as e: # pylint: disable=broad-except
234
+ last_exception = e
235
+ if retry_count >= max_retries - 1:
236
+ # Retries exhausted
237
+ raise
238
+
239
+ # Check if this is a transient error (similar to sync version logic)
240
+ is_transient = _is_transient_error_async(e)
241
+ if not is_transient:
242
+ # Permanent error, no need to retry
243
+ raise
244
+
245
+ logger.debug(f'Retry async request due to {e}, '
246
+ f'attempt {retry_count + 1}/{max_retries}')
247
+ await asyncio.sleep(backoff.current_backoff())
248
+
249
+ # This should never be reached, but just in case
250
+ raise last_exception
251
+
252
+
253
+ async def request_without_retry_async(session: 'aiohttp.ClientSession',
254
+ method: str, url: str,
255
+ **kwargs) -> 'aiohttp.ClientResponse':
256
+ """Send an async request to the API server without retry."""
257
+ # Add API version headers for compatibility (like sync version does)
258
+ if 'headers' not in kwargs:
259
+ kwargs['headers'] = {}
260
+ kwargs['headers'][constants.API_VERSION_HEADER] = str(constants.API_VERSION)
261
+ kwargs['headers'][constants.VERSION_HEADER] = (
262
+ versions.get_local_readable_version())
263
+
264
+ try:
265
+ response = await session.request(method, url, **kwargs)
266
+
267
+ # Handle server unavailability (503 status) - same as sync version
268
+ if response.status == 503:
269
+ with ux_utils.print_exception_no_traceback():
270
+ raise exceptions.ServerTemporarilyUnavailableError(
271
+ 'SkyPilot API server is temporarily unavailable. '
272
+ 'Please try again later.')
273
+
274
+ # Set remote API version and version from headers - same as sync version
275
+ remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
276
+ remote_version = response.headers.get(constants.VERSION_HEADER)
277
+ if remote_api_version is not None:
278
+ versions.set_remote_api_version(int(remote_api_version))
279
+ if remote_version is not None:
280
+ versions.set_remote_version(remote_version)
281
+
282
+ return response
283
+
284
+ except aiohttp.ClientError as e:
285
+ # Convert aiohttp errors to appropriate SkyPilot exceptions
286
+ if isinstance(e, aiohttp.ClientConnectorError):
287
+ raise exceptions.RequestInterruptedError(
288
+ f'Connection failed: {e}') from e
289
+ elif isinstance(e, aiohttp.ClientTimeout):
290
+ raise exceptions.RequestInterruptedError(
291
+ f'Request timeout: {e}') from e
292
+ else:
293
+ raise
294
+
295
+
296
+ def _is_transient_error_async(e: Exception) -> bool:
297
+ """Check if an exception from async request is transient and should be
298
+ retried.
299
+
300
+ Mirrors the logic from the sync version's is_transient_error().
301
+ """
302
+ if isinstance(e, aiohttp.ClientError):
303
+ # For response errors, check status code if available
304
+ if isinstance(e, aiohttp.ClientResponseError):
305
+ # Only server error is considered as transient (same as sync
306
+ # version)
307
+ return e.status >= 500
308
+ # Consider connection errors and timeouts as transient
309
+ if isinstance(e, (aiohttp.ClientConnectorError, aiohttp.ClientTimeout)):
310
+ return True
311
+
312
+ # Consider server temporarily unavailable as transient
313
+ if isinstance(e, exceptions.ServerTemporarilyUnavailableError):
314
+ return True
315
+
316
+ # It is hard to enumerate all other errors that are transient, e.g.
317
+ # broken pipe, connection refused, etc. Instead, it is safer to assume
318
+ # all other errors might be transient since we only retry for 3 times
319
+ # by default. (Same comment as in sync version)
320
+ return True
sky/server/server.py CHANGED
@@ -51,6 +51,8 @@ from sky.server import metrics
51
51
  from sky.server import state
52
52
  from sky.server import stream_utils
53
53
  from sky.server import versions
54
+ from sky.server.auth import authn
55
+ from sky.server.auth import oauth2_proxy
54
56
  from sky.server.requests import executor
55
57
  from sky.server.requests import payloads
56
58
  from sky.server.requests import preconditions
@@ -120,41 +122,6 @@ def _basic_auth_401_response(content: str):
120
122
  content=content)
121
123
 
122
124
 
123
- # TODO(hailong): Remove this function and use request.state.auth_user instead.
124
- async def _override_user_info_in_request_body(request: fastapi.Request,
125
- auth_user: Optional[models.User]):
126
- if auth_user is None:
127
- return
128
-
129
- body = await request.body()
130
- if body:
131
- try:
132
- original_json = await request.json()
133
- except (json.JSONDecodeError, UnicodeDecodeError) as e:
134
- logger.error(f'Error parsing request JSON: {e}')
135
- else:
136
- logger.debug(f'Overriding user for {request.state.request_id}: '
137
- f'{auth_user.name}, {auth_user.id}')
138
- if 'env_vars' in original_json:
139
- if isinstance(original_json.get('env_vars'), dict):
140
- original_json['env_vars'][
141
- constants.USER_ID_ENV_VAR] = auth_user.id
142
- original_json['env_vars'][
143
- constants.USER_ENV_VAR] = auth_user.name
144
- else:
145
- logger.warning(
146
- f'"env_vars" in request body is not a dictionary '
147
- f'for request {request.state.request_id}. '
148
- 'Skipping user info injection into body.')
149
- else:
150
- original_json['env_vars'] = {}
151
- original_json['env_vars'][
152
- constants.USER_ID_ENV_VAR] = auth_user.id
153
- original_json['env_vars'][
154
- constants.USER_ENV_VAR] = auth_user.name
155
- request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
156
-
157
-
158
125
  def _try_set_basic_auth_user(request: fastapi.Request):
159
126
  auth_header = request.headers.get('authorization')
160
127
  if not auth_header or not auth_header.lower().startswith('basic '):
@@ -281,7 +248,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
281
248
  apr_md5_crypt.verify(password, user.password)):
282
249
  valid_user = True
283
250
  request.state.auth_user = user
284
- await _override_user_info_in_request_body(request, user)
251
+ await authn.override_user_info_in_request_body(request, user)
285
252
  break
286
253
  if not valid_user:
287
254
  return _basic_auth_401_response('Invalid credentials')
@@ -400,7 +367,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
400
367
  request.state.auth_user = auth_user
401
368
 
402
369
  # Override user info in request body for service account requests
403
- await _override_user_info_in_request_body(request, auth_user)
370
+ await authn.override_user_info_in_request_body(request, auth_user)
404
371
 
405
372
  logger.debug(f'Authenticated service account: {user_id}')
406
373
 
@@ -445,7 +412,7 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
445
412
  if auth_user is not None:
446
413
  request.state.auth_user = auth_user
447
414
 
448
- await _override_user_info_in_request_body(request, auth_user)
415
+ await authn.override_user_info_in_request_body(request, auth_user)
449
416
  return await call_next(request)
450
417
 
451
418
 
@@ -484,6 +451,8 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
484
451
  del app # unused
485
452
  # Startup: Run background tasks
486
453
  for event in daemons.INTERNAL_REQUEST_DAEMONS:
454
+ if event.should_skip():
455
+ continue
487
456
  try:
488
457
  executor.schedule_request(
489
458
  request_id=event.id,
@@ -625,6 +594,8 @@ app.add_middleware(
625
594
  # RBACMiddleware must precede all the auth middleware, so it can access
626
595
  # request.state.auth_user.
627
596
  app.add_middleware(RBACMiddleware)
597
+ # Authentication based on oauth2-proxy.
598
+ app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
628
599
  # AuthProxyMiddleware should precede BasicAuthMiddleware and
629
600
  # BearerTokenMiddleware, since it should be skipped if either of those set the
630
601
  # auth user.
@@ -1574,9 +1545,42 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
1574
1545
  - commit: str; The commit hash of SkyPilot used for API server.
1575
1546
  """
1576
1547
  user = request.state.auth_user
1577
- logger.info(f'Health endpoint: request.state.auth_user = {user}')
1548
+ server_status = common.ApiServerStatus.HEALTHY
1549
+ if getattr(request.state, 'anonymous_user', False):
1550
+ # API server authentication is enabled, but the request is not
1551
+ # authenticated. We still have to serve the request because the
1552
+ # /api/health endpoint has two different usages:
1553
+ # 1. For health check from `api start` and external orchestration
1554
+ # tools (k8s), which does not require authentication and user info.
1555
+ # 2. Return server info to client and hint client to login if required.
1556
+ # Separating these two usages into different APIs will break backward
1557
+ # compatibility for existing orchestration solutions (e.g. helm chart).
1558
+ # So we serve these two usages in a backward compatible manner below.
1559
+ client_version = versions.get_remote_api_version()
1560
+ # - For Client with API version >= 14, we return 200 response with
1561
+ # status=NEEDS_AUTH, new client will handle the login process.
1562
+ # - For health check from `sky api start`, the client code always uses
1563
+ # the same API version with the server, thus there is no compatibility
1564
+ # issue.
1565
+ server_status = common.ApiServerStatus.NEEDS_AUTH
1566
+ if client_version is None:
1567
+ # - For health check from orchestration tools (e.g. k8s), we also
1568
+ # return 200 with status=NEEDS_AUTH, which passes HTTP probe
1569
+ # check.
1570
+ # - There is no harm when a malicious client calls /api/health
1571
+ # without authentication since no sensitive information is
1572
+ # returned.
1573
+ return {'status': common.ApiServerStatus.HEALTHY}
1574
+ # TODO(aylei): remove this after min_compatible_api_version >= 14.
1575
+ if client_version < 14:
1576
+ # For Client with API version < 14, the NEEDS_AUTH status is not
1577
+ # honored. Return 401 to trigger the login process.
1578
+ raise fastapi.HTTPException(status_code=401,
1579
+ detail='Authentication required')
1580
+
1581
+ logger.debug(f'Health endpoint: request.state.auth_user = {user}')
1578
1582
  return {
1579
- 'status': common.ApiServerStatus.HEALTHY.value,
1583
+ 'status': server_status,
1580
1584
  # Kept for backward compatibility, clients before 0.11.0 will read this
1581
1585
  # field to check compatibility and hint the user to upgrade the CLI.
1582
1586
  # TODO(aylei): remove this field after 0.13.0
@@ -69,6 +69,7 @@ install_requires = [
69
69
  'gitpython',
70
70
  'types-paramiko',
71
71
  'alembic',
72
+ 'aiohttp',
72
73
  ]
73
74
 
74
75
  server_dependencies = [
@@ -76,6 +77,7 @@ server_dependencies = [
76
77
  'sqlalchemy_adapter',
77
78
  'passlib',
78
79
  'pyjwt',
80
+ 'aiohttp',
79
81
  ]
80
82
 
81
83
  local_ray = [
sky/skylet/constants.py CHANGED
@@ -406,7 +406,7 @@ PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
406
406
  PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
407
407
  '~/.sky/.controller_recovery_restarting_signal')
408
408
 
409
- HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/ha_recovery.log'
409
+ HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/{}ha_recovery.log'
410
410
 
411
411
  # The placeholder for the local skypilot config path in file mounts for
412
412
  # controllers.
sky/skylet/events.py CHANGED
@@ -96,8 +96,12 @@ class ServiceUpdateEvent(SkyletEvent):
96
96
  """
97
97
  EVENT_INTERVAL_SECONDS = 300
98
98
 
99
+ def __init__(self, pool: bool) -> None:
100
+ super().__init__()
101
+ self._pool = pool
102
+
99
103
  def _run(self):
100
- serve_utils.update_service_status()
104
+ serve_utils.update_service_status(self._pool)
101
105
 
102
106
 
103
107
  class UsageHeartbeatReportEvent(SkyletEvent):
sky/skylet/skylet.py CHANGED
@@ -24,7 +24,9 @@ EVENTS = [
24
24
  # This is for monitoring controller job status. If it becomes
25
25
  # unhealthy, this event will correctly update the controller
26
26
  # status to CONTROLLER_FAILED.
27
- events.ServiceUpdateEvent(),
27
+ events.ServiceUpdateEvent(pool=False),
28
+ # Status refresh for pool.
29
+ events.ServiceUpdateEvent(pool=True),
28
30
  # Report usage heartbeat every 10 minutes.
29
31
  events.UsageHeartbeatReportEvent(),
30
32
  ]
sky/task.py CHANGED
@@ -247,15 +247,20 @@ class Task:
247
247
  secrets: Optional[Dict[str, str]] = None,
248
248
  workdir: Optional[Union[str, Dict[str, Any]]] = None,
249
249
  num_nodes: Optional[int] = None,
250
+ file_mounts: Optional[Dict[str, str]] = None,
251
+ storage_mounts: Optional[Dict[str, storage_lib.Storage]] = None,
250
252
  volumes: Optional[Dict[str, str]] = None,
253
+ resources: Optional[Union['resources_lib.Resources',
254
+ List['resources_lib.Resources'],
255
+ Set['resources_lib.Resources']]] = None,
251
256
  # Advanced:
252
257
  docker_image: Optional[str] = None,
253
258
  event_callback: Optional[str] = None,
254
259
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
255
260
  # Internal use only.
256
- file_mounts_mapping: Optional[Dict[str, str]] = None,
257
- volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
258
- metadata: Optional[Dict[str, Any]] = None,
261
+ _file_mounts_mapping: Optional[Dict[str, str]] = None,
262
+ _volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
263
+ _metadata: Optional[Dict[str, Any]] = None,
259
264
  _user_specified_yaml: Optional[str] = None,
260
265
  ):
261
266
  """Initializes a Task.
@@ -315,11 +320,31 @@ class Task:
315
320
  setup/run command, where ``run`` can either be a str, meaning all
316
321
  nodes get the same command, or a lambda, with the semantics
317
322
  documented above.
323
+ file_mounts: An optional dict of ``{remote_path: (local_path|cloud
324
+ URI)}``, where remote means the VM(s) on which this Task will
325
+ eventually run on, and local means the node from which the task is
326
+ launched.
327
+ storage_mounts: an optional dict of ``{mount_path: sky.Storage
328
+ object}``, where mount_path is the path inside the remote VM(s)
329
+ where the Storage object will be mounted on.
330
+ volumes: A dict of volumes to be mounted for the task. The dict has
331
+ the form of ``{mount_path: volume_name}``.
332
+ resources: either a sky.Resources, a set of them, or a list of them.
333
+ A set or a list of resources asks the optimizer to "pick the
334
+ best of these resources" to run this task.
318
335
  docker_image: (EXPERIMENTAL: Only in effect when LocalDockerBackend
319
336
  is used.) The base docker image that this Task will be built on.
320
337
  Defaults to 'gpuci/miniforge-cuda:11.4-devel-ubuntu18.04'.
338
+ event_callback: A bash script that will be executed when the task
339
+ changes state.
321
340
  blocked_resources: A set of resources that this task cannot run on.
322
- metadata: A dictionary of metadata to be added to the task.
341
+ _file_mounts_mapping: (Internal use only) A dictionary of file mounts
342
+ mapping.
343
+ _volume_mounts: (Internal use only) A list of volume mounts.
344
+ _metadata: (Internal use only) A dictionary of metadata to be added to
345
+ the task.
346
+ _user_specified_yaml: (Internal use only) A string of user-specified
347
+ YAML config.
323
348
  """
324
349
  self.name = name
325
350
  self.run = run
@@ -372,11 +397,19 @@ class Task:
372
397
  self.best_resources: Optional[sky.Resources] = None
373
398
 
374
399
  # For internal use only.
375
- self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
400
+ self.file_mounts_mapping: Optional[Dict[str,
401
+ str]] = _file_mounts_mapping
376
402
  self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
377
- volume_mounts)
403
+ _volume_mounts)
404
+
405
+ self._metadata = _metadata if _metadata is not None else {}
378
406
 
379
- self._metadata = metadata if metadata is not None else {}
407
+ if resources is not None:
408
+ self.set_resources(resources)
409
+ if storage_mounts is not None:
410
+ self.set_storage_mounts(storage_mounts)
411
+ if file_mounts is not None:
412
+ self.set_file_mounts(file_mounts)
380
413
 
381
414
  dag = sky.dag.get_current_dag()
382
415
  if dag is not None:
@@ -621,10 +654,10 @@ class Task:
621
654
  num_nodes=config.pop('num_nodes', None),
622
655
  envs=config.pop('envs', None),
623
656
  secrets=config.pop('secrets', None),
624
- event_callback=config.pop('event_callback', None),
625
- file_mounts_mapping=config.pop('file_mounts_mapping', None),
626
657
  volumes=config.pop('volumes', None),
627
- metadata=config.pop('_metadata', None),
658
+ event_callback=config.pop('event_callback', None),
659
+ _file_mounts_mapping=config.pop('file_mounts_mapping', None),
660
+ _metadata=config.pop('_metadata', None),
628
661
  _user_specified_yaml=user_specified_yaml,
629
662
  )
630
663
 
@@ -777,6 +777,10 @@ available_node_types:
777
777
  {{ ray_installation_commands }}
778
778
 
779
779
  VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
780
+ # Apply Ray patches for progress bar fix
781
+ ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
782
+ VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
783
+ }
780
784
  touch /tmp/ray_skypilot_installation_complete
781
785
  echo "=== Ray and skypilot installation completed ==="
782
786
 
@@ -46,6 +46,7 @@ available_node_types:
46
46
  InstanceType: {{instance_type}}
47
47
  ImageId: {{image_id}}
48
48
  DiskSize: {{disk_size}}
49
+ use_spot: {{ use_spot }}
49
50
  network_tier: {{network_tier}}
50
51
  filesystems:
51
52
  {%- for fs in filesystems %}
@@ -209,6 +209,13 @@ class Controllers(enum.Enum):
209
209
  return None
210
210
 
211
211
 
212
+ def get_controller_for_pool(pool: bool) -> Controllers:
213
+ """Get the controller type."""
214
+ if pool:
215
+ return Controllers.JOBS_CONTROLLER
216
+ return Controllers.SKY_SERVE_CONTROLLER
217
+
218
+
212
219
  def high_availability_specified(cluster_name: Optional[str]) -> bool:
213
220
  """Check if the controller high availability is specified in user config.
214
221
  """
sky/utils/rich_utils.py CHANGED
@@ -15,11 +15,13 @@ from sky.utils import message_utils
15
15
  from sky.utils import rich_console_utils
16
16
 
17
17
  if typing.TYPE_CHECKING:
18
+ import aiohttp
18
19
  import requests
19
20
  import rich.console as rich_console
20
21
  else:
21
22
  requests = adaptors_common.LazyImport('requests')
22
23
  rich_console = adaptors_common.LazyImport('rich.console')
24
+ aiohttp = adaptors_common.LazyImport('aiohttp')
23
25
 
24
26
  GeneralStatus = Union['rich_console.Status', 'EncodedStatus']
25
27
 
@@ -398,3 +400,121 @@ def decode_rich_status(
398
400
  finally:
399
401
  if decoding_status is not None:
400
402
  decoding_status.__exit__(None, None, None)
403
+
404
+
405
+ async def decode_rich_status_async(
406
+ response: 'aiohttp.ClientResponse'
407
+ ) -> typing.AsyncIterator[Optional[str]]:
408
+ """Async version of rich_utils.decode_rich_status that decodes rich status
409
+ messages from an aiohttp response.
410
+
411
+ Args:
412
+ response: The aiohttp response.
413
+
414
+ Yields:
415
+ Optional[str]: Decoded lines or None for control messages.
416
+ """
417
+ decoding_status = None
418
+ try:
419
+ last_line = ''
420
+ # Buffer to store incomplete UTF-8 bytes between chunks
421
+ undecoded_buffer = b''
422
+
423
+ # Iterate over the response content in chunks
424
+ async for chunk in response.content.iter_chunked(8192):
425
+ if chunk is None:
426
+ return
427
+
428
+ # Append the new chunk to any leftover bytes from previous iteration
429
+ current_bytes = undecoded_buffer + chunk
430
+ undecoded_buffer = b''
431
+
432
+ # Try to decode the combined bytes
433
+ try:
434
+ encoded_msg = current_bytes.decode('utf-8')
435
+ except UnicodeDecodeError as e:
436
+ # Check if this is potentially an incomplete sequence at the end
437
+ if e.start > 0:
438
+ # Decode the valid part
439
+ encoded_msg = current_bytes[:e.start].decode('utf-8')
440
+
441
+ # Check if the remaining bytes are likely a partial char
442
+ # or actually invalid UTF-8
443
+ remaining_bytes = current_bytes[e.start:]
444
+ if len(remaining_bytes) < 4: # Max UTF-8 char is 4 bytes
445
+ # Likely incomplete - save for next chunk
446
+ undecoded_buffer = remaining_bytes
447
+ else:
448
+ # Likely invalid - replace with replacement character
449
+ encoded_msg += remaining_bytes.decode('utf-8',
450
+ errors='replace')
451
+ undecoded_buffer = b''
452
+ else:
453
+ # Error at the very beginning of the buffer - invalid UTF-8
454
+ encoded_msg = current_bytes.decode('utf-8',
455
+ errors='replace')
456
+ undecoded_buffer = b''
457
+
458
+ lines = encoded_msg.splitlines(keepends=True)
459
+
460
+ # Skip processing if lines is empty to avoid IndexError
461
+ if not lines:
462
+ continue
463
+
464
+ lines[0] = last_line + lines[0]
465
+ last_line = lines[-1]
466
+ # If the last line is not ended with `\r` or `\n` (with ending
467
+ # spaces stripped), it means the last line is not a complete line.
468
+ # We keep the last line in the buffer and continue.
469
+ if (not last_line.strip(' ').endswith('\r') and
470
+ not last_line.strip(' ').endswith('\n')):
471
+ lines = lines[:-1]
472
+ else:
473
+ # Reset the buffer for the next line, as the last line is a
474
+ # complete line.
475
+ last_line = ''
476
+
477
+ for line in lines:
478
+ if line.endswith('\r\n'):
479
+ # Replace `\r\n` with `\n`, as printing a line ends with
480
+ # `\r\n` in linux will cause the line to be empty.
481
+ line = line[:-2] + '\n'
482
+ is_payload, line = message_utils.decode_payload(
483
+ line, raise_for_mismatch=False)
484
+ control = None
485
+ if is_payload:
486
+ control, encoded_status = Control.decode(line)
487
+ if control is None:
488
+ yield line
489
+ continue
490
+
491
+ if control == Control.RETRY:
492
+ raise exceptions.RequestInterruptedError(
493
+ 'Streaming interrupted. Please retry.')
494
+ # control is not None, i.e. it is a rich status control message.
495
+ # In async context, we'll handle rich status controls normally
496
+ # since async typically runs in main thread
497
+ if control == Control.INIT:
498
+ decoding_status = client_status(encoded_status)
499
+ else:
500
+ if decoding_status is None:
501
+ # status may not be initialized if a user uses --tail for
502
+ # sky api logs.
503
+ continue
504
+ assert decoding_status is not None, (
505
+ f'Rich status not initialized: {line}')
506
+ if control == Control.UPDATE:
507
+ decoding_status.update(encoded_status)
508
+ elif control == Control.STOP:
509
+ decoding_status.stop()
510
+ elif control == Control.EXIT:
511
+ decoding_status.__exit__(None, None, None)
512
+ elif control == Control.START:
513
+ decoding_status.start()
514
+ elif control == Control.HEARTBEAT:
515
+ # Heartbeat is not displayed to the user, so we do not
516
+ # need to update the status.
517
+ pass
518
+ finally:
519
+ if decoding_status is not None:
520
+ decoding_status.__exit__(None, None, None)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250804
3
+ Version: 1.0.0.dev20250806
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -57,6 +57,7 @@ Requires-Dist: pyjwt
57
57
  Requires-Dist: gitpython
58
58
  Requires-Dist: types-paramiko
59
59
  Requires-Dist: alembic
60
+ Requires-Dist: aiohttp
60
61
  Provides-Extra: aws
61
62
  Requires-Dist: awscli>=1.27.10; extra == "aws"
62
63
  Requires-Dist: botocore>=1.29.10; extra == "aws"
@@ -132,6 +133,7 @@ Requires-Dist: casbin; extra == "server"
132
133
  Requires-Dist: sqlalchemy_adapter; extra == "server"
133
134
  Requires-Dist: passlib; extra == "server"
134
135
  Requires-Dist: pyjwt; extra == "server"
136
+ Requires-Dist: aiohttp; extra == "server"
135
137
  Provides-Extra: all
136
138
  Requires-Dist: awscli>=1.27.10; extra == "all"
137
139
  Requires-Dist: botocore>=1.29.10; extra == "all"
@@ -186,6 +188,7 @@ Requires-Dist: casbin; extra == "all"
186
188
  Requires-Dist: sqlalchemy_adapter; extra == "all"
187
189
  Requires-Dist: passlib; extra == "all"
188
190
  Requires-Dist: pyjwt; extra == "all"
191
+ Requires-Dist: aiohttp; extra == "all"
189
192
  Dynamic: author
190
193
  Dynamic: classifier
191
194
  Dynamic: description
@@ -235,6 +238,7 @@ Dynamic: summary
235
238
  ----
236
239
 
237
240
  :fire: *News* :fire:
241
+ - [Aug 2025] Run and serve **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**example**](./llm/gpt-oss/)
238
242
  - [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
239
243
  - [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
240
244
  - [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)