skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251012__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +3 -2
  3. sky/client/cli/command.py +53 -4
  4. sky/client/sdk.py +11 -3
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-4f7079dcab6ed653.js → [job]-e5c9ce6a24fc0de4.js} +1 -1
  8. sky/dashboard/out/_next/static/chunks/{webpack-6a5ddd0184bfa22c.js → webpack-66f23594d38c7f16.js} +1 -1
  9. sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → yOfMelBaFp8uL5F9atyAK}/_buildManifest.js +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/mounting_utils.py +54 -15
  26. sky/jobs/server/server.py +2 -2
  27. sky/provision/kubernetes/instance.py +2 -27
  28. sky/provision/kubernetes/utils.py +47 -6
  29. sky/serve/server/server.py +1 -1
  30. sky/server/constants.py +4 -0
  31. sky/server/requests/executor.py +36 -36
  32. sky/server/requests/payloads.py +2 -0
  33. sky/server/requests/requests.py +119 -2
  34. sky/server/server.py +19 -5
  35. sky/server/stream_utils.py +61 -26
  36. sky/utils/common_utils.py +6 -3
  37. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/METADATA +36 -35
  38. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/RECORD +43 -43
  39. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  40. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → yOfMelBaFp8uL5F9atyAK}/_ssgManifest.js +0 -0
  41. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/WHEEL +0 -0
  42. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/entry_points.txt +0 -0
  43. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/licenses/LICENSE +0 -0
  44. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py CHANGED
@@ -292,6 +292,100 @@ class Request:
             raise


+def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
+    """Serialize the SkyPilot API requests for display purposes.
+
+    This function should be called on the server side to serialize the
+    request body into a human-readable format, e.g., the entrypoint should
+    be a string, and the pid, error, or return value are not needed.
+
+    The returned value will then be displayed on the client side in the
+    request table.
+
+    We do not use `encode` for display to avoid a large amount of data being
+    sent to the client side, especially since the request table could include
+    all the requests.
+    """
+    encoded_requests = []
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
+    for request in requests:
+        if request.request_body is not None:
+            assert isinstance(request.request_body,
+                              payloads.RequestBody), (request.name,
+                                                      request.request_body)
+        user_name = all_users_map.get(request.user_id)
+        payload = payloads.RequestPayload(
+            request_id=request.request_id,
+            name=request.name,
+            entrypoint=request.entrypoint.__name__
+            if request.entrypoint is not None else '',
+            request_body=request.request_body.model_dump_json()
+            if request.request_body is not None else json.dumps(None),
+            status=request.status.value,
+            return_value=json.dumps(None),
+            error=json.dumps(None),
+            pid=None,
+            created_at=request.created_at,
+            schedule_type=request.schedule_type.value,
+            user_id=request.user_id,
+            user_name=user_name,
+            cluster_name=request.cluster_name,
+            status_msg=request.status_msg,
+            should_retry=request.should_retry,
+            finished_at=request.finished_at,
+        )
+        encoded_requests.append(payload)
+    return encoded_requests
+
+
+def _update_request_row_fields(
+        row: Tuple[Any, ...],
+        fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
+    """Update the request row fields."""
+    if not fields:
+        return row
+
+    # Convert tuple to dictionary for easier manipulation
+    content = dict(zip(fields, row))
+
+    # Required fields in RequestPayload
+    if 'request_id' not in fields:
+        content['request_id'] = ''
+    if 'name' not in fields:
+        content['name'] = ''
+    if 'entrypoint' not in fields:
+        content['entrypoint'] = server_constants.EMPTY_PICKLED_VALUE
+    if 'request_body' not in fields:
+        content['request_body'] = server_constants.EMPTY_PICKLED_VALUE
+    if 'status' not in fields:
+        content['status'] = RequestStatus.PENDING.value
+    if 'created_at' not in fields:
+        content['created_at'] = 0
+    if 'user_id' not in fields:
+        content['user_id'] = ''
+    if 'return_value' not in fields:
+        content['return_value'] = json.dumps(None)
+    if 'error' not in fields:
+        content['error'] = json.dumps(None)
+    if 'schedule_type' not in fields:
+        content['schedule_type'] = ScheduleType.SHORT.value
+    # Optional fields in RequestPayload
+    if 'pid' not in fields:
+        content['pid'] = None
+    if 'cluster_name' not in fields:
+        content['cluster_name'] = None
+    if 'status_msg' not in fields:
+        content['status_msg'] = None
+    if 'should_retry' not in fields:
+        content['should_retry'] = False
+    if 'finished_at' not in fields:
+        content['finished_at'] = None
+
+    # Convert back to tuple in the same order as REQUEST_COLUMNS
+    return tuple(content[col] for col in REQUEST_COLUMNS)
+
+
 def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
     """Kill all pending and running requests for a cluster.
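The `fields` projection above pads a partially-selected row back out to the full `REQUEST_COLUMNS` shape. A standalone sketch of the idea, with illustrative column names and defaults rather than the module's actual schema:

```python
from typing import Any, Dict, List, Optional, Tuple

# Illustrative stand-ins for REQUEST_COLUMNS and per-column defaults.
COLUMNS = ['request_id', 'name', 'status', 'created_at']
DEFAULTS: Dict[str, Any] = {'request_id': '', 'name': '',
                            'status': 'PENDING', 'created_at': 0}


def pad_row(row: Tuple[Any, ...],
            fields: Optional[List[str]] = None) -> Tuple[Any, ...]:
    """Rebuild a full row from a partial SELECT, defaulting the rest."""
    if not fields:
        return row
    content = dict(zip(fields, row))
    for col in COLUMNS:
        content.setdefault(col, DEFAULTS[col])
    return tuple(content[col] for col in COLUMNS)


# A row fetched with `SELECT request_id, status ...` is padded back out:
print(pad_row(('abc123', 'RUNNING'), fields=['request_id', 'status']))
# ('abc123', '', 'RUNNING', 0)
```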
 
@@ -634,6 +728,7 @@ class RequestTaskFilter:
             Mutually exclusive with exclude_request_names.
         finished_before: if provided, only include requests finished before this
             timestamp.
+        limit: the number of requests to show. If None, show all requests.

     Raises:
         ValueError: If both exclude_request_names and include_request_names are
@@ -645,6 +740,8 @@ class RequestTaskFilter:
     exclude_request_names: Optional[List[str]] = None
     include_request_names: Optional[List[str]] = None
     finished_before: Optional[float] = None
+    limit: Optional[int] = None
+    fields: Optional[List[str]] = None

     def __post_init__(self):
         if (self.exclude_request_names is not None and
@@ -687,8 +784,13 @@ class RequestTaskFilter:
         if filter_str:
             filter_str = f' WHERE {filter_str}'
         columns_str = ', '.join(REQUEST_COLUMNS)
-        return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
-                'ORDER BY created_at DESC'), filter_params
+        if self.fields:
+            columns_str = ', '.join(self.fields)
+        query_str = (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
+                     'ORDER BY created_at DESC')
+        if self.limit is not None:
+            query_str += f' LIMIT {self.limit}'
+        return query_str, filter_params


 @init_db
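A quick illustration of the SELECT that `build_query()` now assembles when both knobs are set; the table name `requests` stands in for the actual `REQUEST_TABLE` value:

```python
# Reproduce the query string build_query() emits with fields and limit set.
fields = ['request_id', 'status']
limit = 50
filter_str = ''  # no WHERE clauses in this example

columns_str = ', '.join(fields)
query_str = (f'SELECT {columns_str} FROM requests{filter_str} '
             'ORDER BY created_at DESC')
if limit is not None:
    query_str += f' LIMIT {limit}'
print(query_str)
# SELECT request_id, status FROM requests ORDER BY created_at DESC LIMIT 50
```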
@@ -722,6 +824,21 @@ async def get_request_tasks_async(
         return [Request.from_row(row) for row in rows]


+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_tasks_with_fields_async(
+    req_filter: RequestTaskFilter,
+    fields: Optional[List[str]] = None,
+) -> List[Request]:
+    """Async version of get_request_tasks."""
+    assert _DB is not None
+    async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
+        if not rows:
+            return []
+        rows = [_update_request_row_fields(row, fields) for row in rows]
+        return [Request.from_row(row) for row in rows]
+
+
 @init_db_async
 @metrics_lib.time_me_async
 async def get_api_request_ids_start_with(incomplete: str) -> List[str]:
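A hypothetical server-side usage of the new helper, assuming the names above are importable from `sky.server.requests.requests`: project to only the columns the request table renders, then encode for display.

```python
from sky.server.requests import requests as requests_lib


async def list_active_requests():
    # The field list here is illustrative; fields is passed both to the
    # filter (for the SELECT projection) and to the row-padding step.
    fields = ['request_id', 'name', 'status', 'created_at', 'user_id']
    tasks = await requests_lib.get_request_tasks_with_fields_async(
        req_filter=requests_lib.RequestTaskFilter(
            status=[requests_lib.RequestStatus.PENDING,
                    requests_lib.RequestStatus.RUNNING],
            limit=100,
            fields=fields,
        ),
        fields=fields,
    )
    return requests_lib.encode_requests(tasks)
```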
sky/server/server.py CHANGED
@@ -1243,7 +1243,7 @@ async def logs(
     background_tasks.add_task(task.cancel)
     # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
     # the same approach as /stream.
-    return stream_utils.stream_response(
+    return stream_utils.stream_response_for_long_request(
         request_id=request.state.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
@@ -1539,6 +1539,7 @@ async def stream(
             'X-Accel-Buffering': 'no'
         })

+    polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
     # Original plain text streaming logic
     if request_id is not None:
         request_task = await requests_lib.get_request_async(request_id)
@@ -1553,6 +1554,8 @@ async def stream(
                 raise fastapi.HTTPException(
                     status_code=404,
                     detail=f'Log of request {request_id!r} has been deleted')
+        if request_task.schedule_type == requests_lib.ScheduleType.LONG:
+            polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
     else:
         assert log_path is not None, (request_id, log_path)
         if log_path == constants.API_SERVER_LOGS:
@@ -1600,7 +1603,8 @@ async def stream(
             log_path_to_stream,
             plain_logs=format == 'plain',
             tail=tail,
-            follow=follow),
+            follow=follow,
+            polling_interval=polling_interval),
         media_type='text/plain',
         headers=headers,
     )
@@ -1625,6 +1629,10 @@ async def api_status(
         None, description='Request IDs to get status for.'),
     all_status: bool = fastapi.Query(
         False, description='Get finished requests as well.'),
+    limit: Optional[int] = fastapi.Query(
+        None, description='Number of requests to show.'),
+    fields: Optional[List[str]] = fastapi.Query(
+        None, description='Fields to get. If None, get all fields.'),
 ) -> List[payloads.RequestPayload]:
     """Gets the list of requests."""
     if request_ids is None:
@@ -1634,9 +1642,15 @@ async def api_status(
                 requests_lib.RequestStatus.PENDING,
                 requests_lib.RequestStatus.RUNNING,
             ]
-        request_tasks = await requests_lib.get_request_tasks_async(
-            req_filter=requests_lib.RequestTaskFilter(status=statuses))
-        return [r.readable_encode() for r in request_tasks]
+        request_tasks = await requests_lib.get_request_tasks_with_fields_async(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=statuses,
+                limit=limit,
+                fields=fields,
+            ),
+            fields=fields,
+        )
+        return requests_lib.encode_requests(request_tasks)
     else:
         encoded_request_tasks = []
         for request_id in request_ids:
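From the client side, the new parameters ride in as query strings; repeated keys map onto FastAPI's `List[str]` parameter. A sketch where the server URL and route are assumptions for illustration:

```python
# Fetch a bounded, projected request list from the API server.
import requests

resp = requests.get(
    'http://localhost:46580/api/status',  # assumed local server address
    params={
        'limit': 20,
        # Repeated 'fields' keys become the List[str] query parameter.
        'fields': ['request_id', 'name', 'status', 'created_at'],
    },
)
resp.raise_for_status()
for payload in resp.json():
    print(payload['request_id'], payload['status'])
```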
sky/server/stream_utils.py CHANGED
@@ -11,6 +11,7 @@ import fastapi
 from sky import global_user_state
 from sky import sky_logging
 from sky.server.requests import requests as requests_lib
+from sky.utils import common_utils
 from sky.utils import message_utils
 from sky.utils import rich_utils
 from sky.utils import status_lib
@@ -24,7 +25,9 @@ logger = sky_logging.init_logger(__name__)
 _BUFFER_SIZE = 8 * 1024  # 8KB
 _BUFFER_TIMEOUT = 0.02  # 20ms
 _HEARTBEAT_INTERVAL = 30
-_CLUSTER_STATUS_INTERVAL = 1
+
+LONG_REQUEST_POLL_INTERVAL = 1
+DEFAULT_POLL_INTERVAL = 0.1


 async def _yield_log_file_with_payloads_skipped(
@@ -41,12 +44,14 @@ async def _yield_log_file_with_payloads_skipped(


 async def log_streamer(
-        request_id: Optional[str],
-        log_path: pathlib.Path,
-        plain_logs: bool = False,
-        tail: Optional[int] = None,
-        follow: bool = True,
-        cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
+    request_id: Optional[str],
+    log_path: pathlib.Path,
+    plain_logs: bool = False,
+    tail: Optional[int] = None,
+    follow: bool = True,
+    cluster_name: Optional[str] = None,
+    polling_interval: float = DEFAULT_POLL_INTERVAL
+) -> AsyncGenerator[str, None]:
     """Streams the logs of a request.

     Args:
@@ -84,6 +89,11 @@ async def log_streamer(
                             f'scheduled: {request_id}')
         req_status = request_task.status
         req_msg = request_task.status_msg
+        # Slowly back off the database polling up to every 1 second, to avoid
+        # overloading the CPU and DB.
+        backoff = common_utils.Backoff(initial_backoff=polling_interval,
+                                       max_backoff_factor=10,
+                                       multiplier=1.2)
         while req_status < requests_lib.RequestStatus.RUNNING:
             if req_msg is not None:
                 waiting_msg = request_task.status_msg
@@ -99,7 +109,7 @@ async def log_streamer(
             # TODO(aylei): we should use a better mechanism to avoid busy
             # polling the DB, which can be a bottleneck for high-concurrency
             # requests.
-            await asyncio.sleep(0.1)
+            await asyncio.sleep(backoff.current_backoff())
             status_with_msg = await requests_lib.get_request_status_async(
                 request_id, include_msg=True)
             req_status = status_with_msg.status
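With the defaults wired in above (`DEFAULT_POLL_INTERVAL = 0.1`, multiplier 1.2, `max_backoff_factor = 10`), the sleep between status polls grows roughly as follows, jitter omitted:

```python
# Approximate poll intervals: geometric growth from 0.1s, capped at
# 10 * 0.1 = 1.0s.
interval, multiplier, cap = 0.1, 1.2, 1.0
for _ in range(15):
    print(round(interval, 3), end=' ')
    interval = min(interval * multiplier, cap)
# 0.1 0.12 0.144 0.173 0.207 0.249 0.299 0.358 0.43 0.516 0.619 0.743 0.892 1.0 1.0
```

So a waiter that stays pending settles at roughly one DB query per second instead of ten.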
@@ -111,17 +121,20 @@ async def log_streamer(

     async with aiofiles.open(log_path, 'rb') as f:
         async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
-                                          follow, cluster_name):
+                                          follow, cluster_name,
+                                          polling_interval):
             yield chunk


 async def _tail_log_file(
-        f: aiofiles.threadpool.binary.AsyncBufferedReader,
-        request_id: Optional[str] = None,
-        plain_logs: bool = False,
-        tail: Optional[int] = None,
-        follow: bool = True,
-        cluster_name: Optional[str] = None) -> AsyncGenerator[str, None]:
+    f: aiofiles.threadpool.binary.AsyncBufferedReader,
+    request_id: Optional[str] = None,
+    plain_logs: bool = False,
+    tail: Optional[int] = None,
+    follow: bool = True,
+    cluster_name: Optional[str] = None,
+    polling_interval: float = DEFAULT_POLL_INTERVAL
+) -> AsyncGenerator[str, None]:
     """Tail the opened log file, buffer the lines and flush in chunks."""

     if tail is not None:
@@ -137,7 +150,7 @@ async def _tail_log_file(
             yield line_str

     last_heartbeat_time = asyncio.get_event_loop().time()
-    last_cluster_status_check_time = asyncio.get_event_loop().time()
+    last_status_check_time = asyncio.get_event_loop().time()

     # Buffer the lines in memory and flush them in chunks to improve log
     # tailing throughput.
@@ -167,7 +180,17 @@ async def _tail_log_file(

         line: Optional[bytes] = await f.readline()
         if not line:
-            if request_id is not None:
+            # Avoid checking the status too frequently to avoid overloading the
+            # DB.
+            should_check_status = (current_time -
+                                   last_status_check_time) >= polling_interval
+            if not follow:
+                # We will only hit this path once, but we should make sure to
+                # check the status so that we display the final request status
+                # if the request is complete.
+                should_check_status = True
+            if request_id is not None and should_check_status:
+                last_status_check_time = current_time
                 req_status = await requests_lib.get_request_status_async(
                     request_id)
                 if req_status.status > requests_lib.RequestStatus.RUNNING:
@@ -185,20 +208,19 @@ async def _tail_log_file(
                                    ' cancelled\n')
                     break
             if not follow:
+                # The below checks (cluster status, heartbeat) are not needed
+                # for non-follow logs.
                 break
             # Provision logs pass in cluster_name, check cluster status
-            # periodically to see if provisioning is done. We only
-            # check once a second to avoid overloading the DB.
-            check_status = (current_time - last_cluster_status_check_time
-                            ) >= _CLUSTER_STATUS_INTERVAL
-            if cluster_name is not None and check_status:
+            # periodically to see if provisioning is done.
+            if cluster_name is not None and should_check_status:
+                last_status_check_time = current_time
                 cluster_record = await (
                     global_user_state.get_status_from_cluster_name_async(
                         cluster_name))
                 if (cluster_record is None or
                         cluster_record != status_lib.ClusterStatus.INIT):
                     break
-                last_cluster_status_check_time = current_time
             if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
                 # Currently just used to keep the connection busy, refer to
                 # https://github.com/skypilot-org/skypilot/issues/5750 for
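The refactor collapses the request-status and cluster-status polls onto a single rate-limit timestamp. A distilled, runnable skeleton of that gating (illustrative, not the module's code):

```python
import asyncio
import time


async def tail_once(polling_interval: float = 0.1,
                    follow: bool = False) -> None:
    """Skeleton of the consolidated status-check gating."""
    last_status_check_time = time.monotonic()
    while True:
        current_time = time.monotonic()
        line = None  # stand-in for `await f.readline()`
        if not line:
            should_check_status = (
                current_time - last_status_check_time) >= polling_interval
            if not follow:
                # Always check once so the final status is reported.
                should_check_status = True
            if should_check_status:
                last_status_check_time = current_time
                print('checking request/cluster status...')
            if not follow:
                break
            await asyncio.sleep(polling_interval)


asyncio.run(tail_once())
```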
@@ -234,9 +256,22 @@ async def _tail_log_file(
             yield chunk


+def stream_response_for_long_request(
+    request_id: str,
+    logs_path: pathlib.Path,
+    background_tasks: fastapi.BackgroundTasks,
+) -> fastapi.responses.StreamingResponse:
+    return stream_response(request_id,
+                           logs_path,
+                           background_tasks,
+                           polling_interval=LONG_REQUEST_POLL_INTERVAL)
+
+
 def stream_response(
-        request_id: str, logs_path: pathlib.Path,
-        background_tasks: fastapi.BackgroundTasks
+    request_id: str,
+    logs_path: pathlib.Path,
+    background_tasks: fastapi.BackgroundTasks,
+    polling_interval: float = DEFAULT_POLL_INTERVAL
 ) -> fastapi.responses.StreamingResponse:

     async def on_disconnect():
@@ -249,7 +284,7 @@ def stream_response(
     background_tasks.add_task(on_disconnect)

     return fastapi.responses.StreamingResponse(
-        log_streamer(request_id, logs_path),
+        log_streamer(request_id, logs_path, polling_interval=polling_interval),
         media_type='text/plain',
         headers={
             'Cache-Control': 'no-cache, no-transform',
sky/utils/common_utils.py CHANGED
@@ -265,13 +265,16 @@ def get_global_job_id(job_timestamp: str,

 class Backoff:
     """Exponential backoff with jittering."""
-    MULTIPLIER = 1.6
     JITTER = 0.4

-    def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
+    def __init__(self,
+                 initial_backoff: float = 5,
+                 max_backoff_factor: int = 5,
+                 multiplier: float = 1.6):
         self._initial = True
         self._backoff = 0.0
         self._initial_backoff = initial_backoff
+        self._multiplier = multiplier
         self._max_backoff = max_backoff_factor * self._initial_backoff

     # https://github.com/grpc/grpc/blob/2d4f3c56001cd1e1f85734b2f7c5ce5f2797c38a/doc/connection-backoff.md
@@ -283,7 +286,7 @@ class Backoff:
             self._initial = False
             self._backoff = min(self._initial_backoff, self._max_backoff)
         else:
-            self._backoff = min(self._backoff * self.MULTIPLIER,
+            self._backoff = min(self._backoff * self._multiplier,
                                 self._max_backoff)
         self._backoff += random.uniform(-self.JITTER * self._backoff,
                                         self.JITTER * self._backoff)
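A usage sketch of the parameterized class, assuming `sky.utils.common_utils` is importable; these are the same arguments the stream poller above passes:

```python
from sky.utils import common_utils

# Initial 0.1s, growth factor 1.2, ceiling 10 * 0.1 = 1.0s.
backoff = common_utils.Backoff(initial_backoff=0.1,
                               max_backoff_factor=10,
                               multiplier=1.2)
# Jittered delays trending from ~0.1s toward the 1s ceiling.
print([round(backoff.current_backoff(), 3) for _ in range(5)])
```

Turning the former `MULTIPLIER` class constant into a constructor argument lets callers choose a gentler growth curve than the gRPC-style 1.6 default without touching other users of the class.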
{skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251012.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20251009
+Version: 1.0.0.dev20251012
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -155,51 +155,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
 Requires-Dist: aiosqlite; extra == "server"
 Requires-Dist: greenlet; extra == "server"
 Provides-Extra: all
+Requires-Dist: anyio; extra == "all"
 Requires-Dist: nebius>=0.2.47; extra == "all"
-Requires-Dist: websockets; extra == "all"
+Requires-Dist: ecsapi>=0.2.0; extra == "all"
+Requires-Dist: ibm-cos-sdk; extra == "all"
+Requires-Dist: python-dateutil; extra == "all"
 Requires-Dist: azure-core>=1.31.0; extra == "all"
-Requires-Dist: msgraph-sdk; extra == "all"
-Requires-Dist: ibm-cloud-sdk-core; extra == "all"
-Requires-Dist: google-cloud-storage; extra == "all"
+Requires-Dist: aiosqlite; extra == "all"
+Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
+Requires-Dist: cudo-compute>=0.1.10; extra == "all"
+Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: casbin; extra == "all"
-Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
-Requires-Dist: ecsapi>=0.2.0; extra == "all"
-Requires-Dist: awscli>=1.27.10; extra == "all"
+Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
+Requires-Dist: boto3>=1.26.1; extra == "all"
 Requires-Dist: sqlalchemy_adapter; extra == "all"
-Requires-Dist: msrestazure; extra == "all"
-Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
-Requires-Dist: ibm-vpc; extra == "all"
-Requires-Dist: ray[default]>=2.6.1; extra == "all"
-Requires-Dist: anyio; extra == "all"
-Requires-Dist: runpod>=1.6.1; extra == "all"
-Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
+Requires-Dist: passlib; extra == "all"
 Requires-Dist: greenlet; extra == "all"
-Requires-Dist: ibm-cos-sdk; extra == "all"
-Requires-Dist: docker; extra == "all"
+Requires-Dist: msrestazure; extra == "all"
+Requires-Dist: colorama<0.4.5; extra == "all"
 Requires-Dist: azure-common; extra == "all"
-Requires-Dist: python-dateutil; extra == "all"
+Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
+Requires-Dist: websockets; extra == "all"
 Requires-Dist: tomli; python_version < "3.11" and extra == "all"
-Requires-Dist: pyjwt; extra == "all"
-Requires-Dist: botocore>=1.29.10; extra == "all"
-Requires-Dist: azure-core>=1.24.0; extra == "all"
-Requires-Dist: colorama<0.4.5; extra == "all"
-Requires-Dist: passlib; extra == "all"
+Requires-Dist: ray[default]>=2.6.1; extra == "all"
 Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
-Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
+Requires-Dist: google-cloud-storage; extra == "all"
+Requires-Dist: docker; extra == "all"
+Requires-Dist: grpcio>=1.63.0; extra == "all"
+Requires-Dist: msgraph-sdk; extra == "all"
+Requires-Dist: ibm-vpc; extra == "all"
 Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
-Requires-Dist: aiosqlite; extra == "all"
-Requires-Dist: aiohttp; extra == "all"
-Requires-Dist: cudo-compute>=0.1.10; extra == "all"
+Requires-Dist: pyjwt; extra == "all"
 Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
-Requires-Dist: boto3>=1.26.1; extra == "all"
-Requires-Dist: pydo>=0.3.0; extra == "all"
+Requires-Dist: botocore>=1.29.10; extra == "all"
 Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
+Requires-Dist: oci; extra == "all"
+Requires-Dist: awscli>=1.27.10; extra == "all"
+Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
+Requires-Dist: azure-core>=1.24.0; extra == "all"
 Requires-Dist: azure-identity>=1.19.0; extra == "all"
+Requires-Dist: runpod>=1.6.1; extra == "all"
+Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
+Requires-Dist: ibm-cloud-sdk-core; extra == "all"
 Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
-Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
-Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
-Requires-Dist: grpcio>=1.63.0; extra == "all"
-Requires-Dist: oci; extra == "all"
+Requires-Dist: aiohttp; extra == "all"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -249,10 +249,11 @@ Dynamic: summary
 ----

 :fire: *News* :fire:
+- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./llm/torchtitan/)
+- [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)
+- [Sep 2025] Network and Storage Benchmarks for LLM training on the cloud: [**blog**](https://maknee.github.io/blog/2025/Network-And-Storage-Training-Skypilot/)
 - [Aug 2025] Serve and finetune **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**serve**](./llm/gpt-oss/) + [**LoRA and full finetuning**](./llm/gpt-oss-finetuning/)
-- [Jul 2025] Run large-scale **LLM training with TorchTitan** on any cloud: [**example**](./llm/torchtitan/)
 - [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
-- [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
 - [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
 - [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
 - [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)