skypilot-nightly 1.0.0.dev20251026__py3-none-any.whl → 1.0.0.dev20251029__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (82)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/coreweave.py +278 -0
  3. sky/backends/backend_utils.py +9 -6
  4. sky/backends/cloud_vm_ray_backend.py +2 -3
  5. sky/check.py +25 -13
  6. sky/client/cli/command.py +34 -15
  7. sky/client/sdk.py +4 -4
  8. sky/cloud_stores.py +73 -0
  9. sky/core.py +7 -5
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{wDQ7aGvICzMNmjIaC37TT → DabuSAKsc_y0wyJxpTIdQ}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +26 -0
  14. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/{webpack-4abaae354da0ba13.js → webpack-485984ca04e021d0.js} +1 -1
  27. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  28. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  29. sky/dashboard/out/clusters/[cluster].html +1 -1
  30. sky/dashboard/out/clusters.html +1 -1
  31. sky/dashboard/out/config.html +1 -1
  32. sky/dashboard/out/index.html +1 -1
  33. sky/dashboard/out/infra/[context].html +1 -1
  34. sky/dashboard/out/infra.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  37. sky/dashboard/out/jobs.html +1 -1
  38. sky/dashboard/out/users.html +1 -1
  39. sky/dashboard/out/volumes.html +1 -1
  40. sky/dashboard/out/workspace/new.html +1 -1
  41. sky/dashboard/out/workspaces/[name].html +1 -1
  42. sky/dashboard/out/workspaces.html +1 -1
  43. sky/data/data_utils.py +92 -1
  44. sky/data/mounting_utils.py +39 -0
  45. sky/data/storage.py +166 -9
  46. sky/global_user_state.py +59 -83
  47. sky/jobs/server/server.py +2 -2
  48. sky/jobs/utils.py +5 -6
  49. sky/optimizer.py +1 -1
  50. sky/provision/kubernetes/instance.py +88 -19
  51. sky/provision/kubernetes/volume.py +2 -2
  52. sky/schemas/api/responses.py +2 -5
  53. sky/serve/replica_managers.py +2 -2
  54. sky/serve/serve_utils.py +9 -2
  55. sky/server/requests/payloads.py +2 -0
  56. sky/server/requests/requests.py +182 -84
  57. sky/server/requests/serializers/decoders.py +3 -3
  58. sky/server/requests/serializers/encoders.py +33 -6
  59. sky/server/server.py +34 -7
  60. sky/server/stream_utils.py +56 -13
  61. sky/setup_files/dependencies.py +2 -0
  62. sky/task.py +10 -0
  63. sky/templates/nebius-ray.yml.j2 +1 -0
  64. sky/utils/cli_utils/status_utils.py +8 -2
  65. sky/utils/context_utils.py +13 -1
  66. sky/utils/resources_utils.py +53 -29
  67. {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/METADATA +50 -34
  68. {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/RECORD +74 -73
  69. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  70. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  73. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  74. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  75. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  76. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  77. /sky/dashboard/out/_next/static/{wDQ7aGvICzMNmjIaC37TT → DabuSAKsc_y0wyJxpTIdQ}/_ssgManifest.js +0 -0
  78. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  79. {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/WHEEL +0 -0
  80. {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/entry_points.txt +0 -0
  81. {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/licenses/LICENSE +0 -0
  82. {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/top_level.txt +0 -0
@@ -60,13 +60,23 @@ def encode_status(
60
60
  clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
61
61
  response = []
62
62
  for cluster in clusters:
63
- response_cluster = cluster.model_dump()
63
+ response_cluster = cluster.model_dump(exclude_none=True)
64
+ # These default setting is needed because last_use and status_updated_at
65
+ # used to be not optional.
66
+ # TODO(syang): remove this after v0.10.7 or v0.11.0
67
+ if 'last_use' not in response_cluster:
68
+ response_cluster['last_use'] = ''
69
+ if 'status_updated_at' not in response_cluster:
70
+ response_cluster['status_updated_at'] = 0
64
71
  response_cluster['status'] = cluster['status'].value
65
72
  handle = serialize_utils.prepare_handle_for_backwards_compatibility(
66
73
  cluster['handle'])
67
74
  response_cluster['handle'] = pickle_and_encode(handle)
75
+ # TODO (syang) We still need to return this field for backwards
76
+ # compatibility.
77
+ # Remove this field at or after v0.10.7 or v0.11.0
68
78
  response_cluster['storage_mounts_metadata'] = pickle_and_encode(
69
- response_cluster['storage_mounts_metadata'])
79
+ None) # Always returns None.
70
80
  response.append(response_cluster)
71
81
  return response
72
82
 
@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
206
216
  @register_encoder('storage_ls')
207
217
  def encode_storage_ls(
208
218
  return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
209
- for storage_info in return_value:
219
+ response_list = [storage_info.model_dump() for storage_info in return_value]
220
+ for storage_info in response_list:
210
221
  storage_info['status'] = storage_info['status'].value
211
222
  storage_info['store'] = [store.value for store in storage_info['store']]
212
- return [storage_info.model_dump() for storage_info in return_value]
223
+ return response_list
213
224
 
214
225
 
215
226
  @register_encoder('volume_list')
@@ -219,11 +230,11 @@ def encode_volume_list(
219
230
 
220
231
 
221
232
  @register_encoder('job_status')
222
- def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
233
+ def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
223
234
  for job_id in return_value.keys():
224
235
  if return_value[job_id] is not None:
225
236
  return_value[job_id] = return_value[job_id].value
226
- return return_value
237
+ return {str(k): v for k, v in return_value.items()}
227
238
 
228
239
 
229
240
  @register_encoder('kubernetes_node_info')
@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
235
246
  @register_encoder('endpoints')
236
247
  def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
237
248
  return {str(k): v for k, v in return_value.items()}
249
+
250
+
251
+ @register_encoder('realtime_kubernetes_gpu_availability')
252
+ def encode_realtime_gpu_availability(
253
+ return_value: List[Tuple[str,
254
+ List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
255
+ # Convert RealtimeGpuAvailability namedtuples to lists
256
+ # for JSON serialization.
257
+ result = []
258
+ for context, gpu_list in return_value:
259
+ gpu_availability_list = []
260
+ for gpu in gpu_list:
261
+ gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
262
+ gpu_availability_list.append(gpu_list_item)
263
+ result.append((context, gpu_availability_list))
264
+ return result
sky/server/server.py CHANGED
@@ -25,6 +25,7 @@ import zipfile
25
25
  import aiofiles
26
26
  import anyio
27
27
  import fastapi
28
+ from fastapi import responses as fastapi_responses
28
29
  from fastapi.middleware import cors
29
30
  import starlette.middleware.base
30
31
  import uvloop
@@ -1497,10 +1498,27 @@ async def local_down(request: fastapi.Request,
1497
1498
  )
1498
1499
 
1499
1500
 
1501
+ async def get_expanded_request_id(request_id: str) -> str:
1502
+ """Gets the expanded request ID for a given request ID prefix."""
1503
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
1504
+ request_id, fields=['request_id'])
1505
+ if request_tasks is None:
1506
+ raise fastapi.HTTPException(status_code=404,
1507
+ detail=f'Request {request_id!r} not found')
1508
+ if len(request_tasks) > 1:
1509
+ raise fastapi.HTTPException(status_code=400,
1510
+ detail=('Multiple requests found for '
1511
+ f'request ID prefix: {request_id}'))
1512
+ return request_tasks[0].request_id
1513
+
1514
+
1500
1515
  # === API server related APIs ===
1501
- @app.get('/api/get')
1516
+ @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
1502
1517
  async def api_get(request_id: str) -> payloads.RequestPayload:
1503
1518
  """Gets a request with a given request ID prefix."""
1519
+ # Validate request_id prefix matches a single request.
1520
+ request_id = await get_expanded_request_id(request_id)
1521
+
1504
1522
  while True:
1505
1523
  req_status = await requests_lib.get_request_status_async(request_id)
1506
1524
  if req_status is None:
@@ -1560,11 +1578,16 @@ async def stream(
1560
1578
  clients, console for CLI/API clients), 'plain' (force plain text),
1561
1579
  'html' (force HTML), or 'console' (force console)
1562
1580
  """
1581
+ # We need to save the user-supplied request ID for the response header.
1582
+ user_supplied_request_id = request_id
1563
1583
  if request_id is not None and log_path is not None:
1564
1584
  raise fastapi.HTTPException(
1565
1585
  status_code=400,
1566
1586
  detail='Only one of request_id and log_path can be provided')
1567
1587
 
1588
+ if request_id is not None:
1589
+ request_id = await get_expanded_request_id(request_id)
1590
+
1568
1591
  if request_id is None and log_path is None:
1569
1592
  request_id = await requests_lib.get_latest_request_id_async()
1570
1593
  if request_id is None:
@@ -1654,7 +1677,9 @@ async def stream(
1654
1677
  'Transfer-Encoding': 'chunked'
1655
1678
  }
1656
1679
  if request_id is not None:
1657
- headers[server_constants.STREAM_REQUEST_HEADER] = request_id
1680
+ headers[server_constants.STREAM_REQUEST_HEADER] = (
1681
+ user_supplied_request_id
1682
+ if user_supplied_request_id else request_id)
1658
1683
 
1659
1684
  return fastapi.responses.StreamingResponse(
1660
1685
  content=stream_utils.log_streamer(request_id,
@@ -1676,7 +1701,7 @@ async def api_cancel(request: fastapi.Request,
1676
1701
  request_id=request.state.request_id,
1677
1702
  request_name='api_cancel',
1678
1703
  request_body=request_cancel_body,
1679
- func=requests_lib.kill_requests,
1704
+ func=requests_lib.kill_requests_with_prefix,
1680
1705
  schedule_type=requests_lib.ScheduleType.SHORT,
1681
1706
  )
1682
1707
 
@@ -1684,7 +1709,7 @@ async def api_cancel(request: fastapi.Request,
1684
1709
  @app.get('/api/status')
1685
1710
  async def api_status(
1686
1711
  request_ids: Optional[List[str]] = fastapi.Query(
1687
- None, description='Request IDs to get status for.'),
1712
+ None, description='Request ID prefixes to get status for.'),
1688
1713
  all_status: bool = fastapi.Query(
1689
1714
  False, description='Get finished requests as well.'),
1690
1715
  limit: Optional[int] = fastapi.Query(
@@ -1711,10 +1736,12 @@ async def api_status(
1711
1736
  else:
1712
1737
  encoded_request_tasks = []
1713
1738
  for request_id in request_ids:
1714
- request_task = await requests_lib.get_request_async(request_id)
1715
- if request_task is None:
1739
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
1740
+ request_id)
1741
+ if request_tasks is None:
1716
1742
  continue
1717
- encoded_request_tasks.append(request_task.readable_encode())
1743
+ for request_task in request_tasks:
1744
+ encoded_request_tasks.append(request_task.readable_encode())
1718
1745
  return encoded_request_tasks
1719
1746
 
1720
1747
 
@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
25
25
  _BUFFER_SIZE = 8 * 1024 # 8KB
26
26
  _BUFFER_TIMEOUT = 0.02 # 20ms
27
27
  _HEARTBEAT_INTERVAL = 30
28
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
29
+
28
30
  # If a SHORT request has been stuck in pending for
29
31
  # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
30
32
  _SHORT_REQUEST_SPINNER_TIMEOUT = 2
@@ -235,6 +237,9 @@ async def _tail_log_file(
235
237
  buffer_bytes = 0
236
238
  last_flush_time = asyncio.get_event_loop().time()
237
239
 
240
+ # Read file in chunks instead of line-by-line for better performance
241
+ incomplete_line = b'' # Buffer for incomplete lines across chunks
242
+
238
243
  async def flush_buffer() -> AsyncGenerator[str, None]:
239
244
  nonlocal buffer, buffer_bytes, last_flush_time
240
245
  if buffer:
@@ -255,8 +260,23 @@ async def _tail_log_file(
255
260
  async for chunk in flush_buffer():
256
261
  yield chunk
257
262
 
258
- line: Optional[bytes] = await f.readline()
259
- if not line:
263
+ # Read file in chunks for better I/O performance
264
+ file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
265
+ if not file_chunk:
266
+ # Process any remaining incomplete line
267
+ if incomplete_line:
268
+ line_str = incomplete_line.decode('utf-8')
269
+ if plain_logs:
270
+ is_payload, line_str = message_utils.decode_payload(
271
+ line_str, raise_for_mismatch=False)
272
+ if not is_payload:
273
+ buffer.append(line_str)
274
+ buffer_bytes += len(line_str.encode('utf-8'))
275
+ else:
276
+ buffer.append(line_str)
277
+ buffer_bytes += len(line_str.encode('utf-8'))
278
+ incomplete_line = b''
279
+
260
280
  # Avoid checking the status too frequently to avoid overloading the
261
281
  # DB.
262
282
  should_check_status = (current_time -
@@ -328,16 +348,39 @@ async def _tail_log_file(
328
348
  # performance but it helps avoid unnecessary heartbeat strings
329
349
  # being printed when the client runs in an old version.
330
350
  last_heartbeat_time = asyncio.get_event_loop().time()
331
- line_str = line.decode('utf-8')
332
- if plain_logs:
333
- is_payload, line_str = message_utils.decode_payload(
334
- line_str, raise_for_mismatch=False)
335
- # TODO(aylei): implement heartbeat mechanism for plain logs,
336
- # sending invisible characters might be okay.
337
- if is_payload:
338
- continue
339
- buffer.append(line_str)
340
- buffer_bytes += len(line_str.encode('utf-8'))
351
+
352
+ # Combine with any incomplete line from previous chunk
353
+ file_chunk = incomplete_line + file_chunk
354
+ incomplete_line = b''
355
+
356
+ # Split chunk into lines, preserving line structure
357
+ lines_bytes = file_chunk.split(b'\n')
358
+
359
+ # If chunk doesn't end with newline, the last element is incomplete
360
+ if file_chunk and not file_chunk.endswith(b'\n'):
361
+ incomplete_line = lines_bytes[-1]
362
+ lines_bytes = lines_bytes[:-1]
363
+ else:
364
+ # If ends with \n, split creates an empty last element we should
365
+ # ignore
366
+ if lines_bytes and lines_bytes[-1] == b'':
367
+ lines_bytes = lines_bytes[:-1]
368
+
369
+ # Process all complete lines in this chunk
370
+ for line_bytes in lines_bytes:
371
+ # Reconstruct line with newline (since split removed it)
372
+ line_str = line_bytes.decode('utf-8') + '\n'
373
+
374
+ if plain_logs:
375
+ is_payload, line_str = message_utils.decode_payload(
376
+ line_str, raise_for_mismatch=False)
377
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
378
+ # sending invisible characters might be okay.
379
+ if is_payload:
380
+ continue
381
+
382
+ buffer.append(line_str)
383
+ buffer_bytes += len(line_str.encode('utf-8'))
341
384
 
342
385
  # Flush remaining lines in the buffer.
343
386
  async for chunk in flush_buffer():
@@ -373,7 +416,7 @@ def stream_response(
373
416
  async def on_disconnect():
374
417
  logger.info(f'User terminated the connection for request '
375
418
  f'{request_id}')
376
- requests_lib.kill_requests([request_id])
419
+ await requests_lib.kill_request_async(request_id)
377
420
 
378
421
  # The background task will be run after returning a response.
379
422
  # https://fastapi.tiangolo.com/tutorial/background-tasks/
@@ -49,6 +49,7 @@ install_requires = [
49
49
  # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
50
50
  'pyyaml > 3.13, != 5.4.*',
51
51
  'ijson',
52
+ 'orjson',
52
53
  'requests',
53
54
  # SkyPilot inherits from uvicorn.Server to customize the behavior of
54
55
  # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
187
188
  'docker': ['docker'] + local_ray,
188
189
  'lambda': [], # No dependencies needed for lambda
189
190
  'cloudflare': aws_dependencies,
191
+ 'coreweave': aws_dependencies,
190
192
  'scp': local_ray,
191
193
  'oci': ['oci'],
192
194
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
sky/task.py CHANGED
@@ -1552,6 +1552,16 @@ class Task:
1552
1552
  self.update_file_mounts({
1553
1553
  mnt_path: blob_path,
1554
1554
  })
1555
+ elif store_type is storage_lib.StoreType.COREWEAVE:
1556
+ if storage.source is not None and not isinstance(
1557
+ storage.source,
1558
+ list) and storage.source.startswith('cw://'):
1559
+ blob_path = storage.source
1560
+ else:
1561
+ blob_path = 'cw://' + storage.name
1562
+ self.update_file_mounts({
1563
+ mnt_path: blob_path,
1564
+ })
1555
1565
  else:
1556
1566
  with ux_utils.print_exception_no_traceback():
1557
1567
  raise ValueError(f'Storage Type {store_type} '
@@ -156,6 +156,7 @@ setup_commands:
156
156
  echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
157
157
  {%- endfor %}
158
158
  {%- endif %}
159
+ IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
159
160
  sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
160
161
  sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
161
162
  mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
282
282
  if resources_str_full is not None:
283
283
  resources_str = resources_str_full
284
284
  if resources_str is None:
285
- resources_str = resources_utils.get_readable_resources_repr(
286
- handle, simplify=truncate)
285
+ resources_str_simple, resources_str_full = (
286
+ resources_utils.get_readable_resources_repr(
287
+ handle, simplified_only=truncate))
288
+ if truncate:
289
+ resources_str = resources_str_simple
290
+ else:
291
+ assert resources_str_full is not None
292
+ resources_str = resources_str_full
287
293
 
288
294
  return resources_str
289
295
  return '-'
@@ -8,6 +8,7 @@ import multiprocessing
8
8
  import os
9
9
  import subprocess
10
10
  import sys
11
+ import time
11
12
  import typing
12
13
  from typing import Any, Callable, IO, Optional, Tuple, TypeVar
13
14
 
@@ -18,6 +19,7 @@ from sky.utils import context
18
19
  from sky.utils import subprocess_utils
19
20
 
20
21
  StreamHandler = Callable[[IO[Any], IO[Any]], str]
22
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS = 0.5
21
23
 
22
24
  logger = sky_logging.init_logger(__name__)
23
25
 
@@ -46,6 +48,7 @@ def hijack_sys_attrs():
46
48
 
47
49
  def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
48
50
  """Passthrough the stream from the process to the output stream"""
51
+ last_flush_time = time.time()
49
52
  wrapped = io.TextIOWrapper(in_stream,
50
53
  encoding='utf-8',
51
54
  newline='',
@@ -55,9 +58,18 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
55
58
  line = wrapped.readline()
56
59
  if line:
57
60
  out_stream.write(line)
58
- out_stream.flush()
61
+
62
+ # Flush based on timeout instead of on every line
63
+ current_time = time.time()
64
+ if (current_time - last_flush_time >=
65
+ PASSTHROUGH_FLUSH_INTERVAL_SECONDS):
66
+ out_stream.flush()
67
+ last_flush_time = current_time
59
68
  else:
60
69
  break
70
+
71
+ # Final flush to ensure all data is written
72
+ out_stream.flush()
61
73
  return ''
62
74
 
63
75
 
@@ -181,57 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
181
181
 
182
182
 
183
183
  def format_resource(resource: 'resources_lib.Resources',
184
- simplify: bool = False) -> str:
184
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
185
185
  resource = resource.assert_launchable()
186
- vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
187
- resource.instance_type)
186
+ is_k8s = str(resource.cloud).lower() == 'kubernetes'
187
+ if resource.accelerators is None or is_k8s or not simplified_only:
188
+ vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
189
+ resource.instance_type)
188
190
 
189
- components = []
191
+ elements_simple = []
192
+ elements_full = []
190
193
 
191
194
  if resource.accelerators is not None:
192
195
  acc, count = list(resource.accelerators.items())[0]
193
- components.append(f'gpus={acc}:{count}')
196
+ elements_simple.append(f'gpus={acc}:{count}')
197
+ elements_full.append(f'gpus={acc}:{count}')
194
198
 
195
- is_k8s = str(resource.cloud).lower() == 'kubernetes'
196
- if (resource.accelerators is None or is_k8s or not simplify):
199
+ if (resource.accelerators is None or is_k8s):
200
+ if vcpu is not None:
201
+ elements_simple.append(f'cpus={int(vcpu)}')
202
+ elements_full.append(f'cpus={int(vcpu)}')
203
+ if mem is not None:
204
+ elements_simple.append(f'mem={int(mem)}')
205
+ elements_full.append(f'mem={int(mem)}')
206
+ elif not simplified_only:
197
207
  if vcpu is not None:
198
- components.append(f'cpus={int(vcpu)}')
208
+ elements_full.append(f'cpus={int(vcpu)}')
199
209
  if mem is not None:
200
- components.append(f'mem={int(mem)}')
210
+ elements_full.append(f'mem={int(mem)}')
201
211
 
202
- instance_type = resource.instance_type
203
- if simplify:
204
- instance_type = common_utils.truncate_long_string(instance_type, 15)
205
212
  if not is_k8s:
206
- components.append(instance_type)
207
- if simplify:
208
- components.append('...')
209
- else:
213
+ instance_type_full = resource.instance_type
214
+ instance_type_simple = common_utils.truncate_long_string(
215
+ instance_type_full, 15)
216
+ elements_simple.append(instance_type_simple)
217
+ elements_full.append(instance_type_full)
218
+ elements_simple.append('...')
219
+ if not simplified_only:
210
220
  image_id = resource.image_id
211
221
  if image_id is not None:
212
222
  if None in image_id:
213
- components.append(f'image_id={image_id[None]}')
223
+ elements_full.append(f'image_id={image_id[None]}')
214
224
  else:
215
- components.append(f'image_id={image_id}')
216
- components.append(f'disk={resource.disk_size}')
225
+ elements_full.append(f'image_id={image_id}')
226
+ elements_full.append(f'disk={resource.disk_size}')
217
227
  disk_tier = resource.disk_tier
218
228
  if disk_tier is not None:
219
- components.append(f'disk_tier={disk_tier.value}')
229
+ elements_full.append(f'disk_tier={disk_tier.value}')
220
230
  ports = resource.ports
221
231
  if ports is not None:
222
- components.append(f'ports={ports}')
232
+ elements_full.append(f'ports={ports}')
223
233
 
224
234
  spot = '[spot]' if resource.use_spot else ''
225
- return f'{spot}({"" if not components else ", ".join(components)})'
226
-
227
-
228
- def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
229
- simplify: bool = False) -> str:
235
+ resources_str_simple = (
236
+ f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
237
+ if simplified_only:
238
+ return resources_str_simple, None
239
+ else:
240
+ resources_str_full = (
241
+ f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
242
+ return resources_str_simple, resources_str_full
243
+
244
+
245
+ def get_readable_resources_repr(
246
+ handle: 'backends.CloudVmRayResourceHandle',
247
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
248
+ resource_str_simple, resource_str_full = format_resource(
249
+ handle.launched_resources, simplified_only)
250
+ if not simplified_only:
251
+ assert resource_str_full is not None
230
252
  if (handle.launched_nodes is not None and
231
253
  handle.launched_resources is not None):
232
- return (f'{handle.launched_nodes}x'
233
- f'{format_resource(handle.launched_resources, simplify)}')
234
- return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
254
+ return (f'{handle.launched_nodes}x{resource_str_simple}',
255
+ None if simplified_only else
256
+ f'{handle.launched_nodes}x{resource_str_full}')
257
+ return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
258
+ _DEFAULT_MESSAGE_HANDLE_INITIALIZING)
235
259
 
236
260
 
237
261
  def make_ray_custom_resources_str(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20251026
3
+ Version: 1.0.0.dev20251029
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -44,6 +44,7 @@ Requires-Dist: psutil
44
44
  Requires-Dist: pulp
45
45
  Requires-Dist: pyyaml!=5.4.*,>3.13
46
46
  Requires-Dist: ijson
47
+ Requires-Dist: orjson
47
48
  Requires-Dist: requests
48
49
  Requires-Dist: uvicorn[standard]<0.36.0,>=0.33.0
49
50
  Requires-Dist: fastapi
@@ -170,6 +171,21 @@ Requires-Dist: grpcio>=1.63.0; extra == "cloudflare"
170
171
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "cloudflare"
171
172
  Requires-Dist: aiosqlite; extra == "cloudflare"
172
173
  Requires-Dist: greenlet; extra == "cloudflare"
174
+ Provides-Extra: coreweave
175
+ Requires-Dist: awscli>=1.27.10; extra == "coreweave"
176
+ Requires-Dist: botocore>=1.29.10; extra == "coreweave"
177
+ Requires-Dist: boto3>=1.26.1; extra == "coreweave"
178
+ Requires-Dist: colorama<0.4.5; extra == "coreweave"
179
+ Requires-Dist: casbin; extra == "coreweave"
180
+ Requires-Dist: sqlalchemy_adapter; extra == "coreweave"
181
+ Requires-Dist: passlib; extra == "coreweave"
182
+ Requires-Dist: pyjwt; extra == "coreweave"
183
+ Requires-Dist: aiohttp; extra == "coreweave"
184
+ Requires-Dist: anyio; extra == "coreweave"
185
+ Requires-Dist: grpcio>=1.63.0; extra == "coreweave"
186
+ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "coreweave"
187
+ Requires-Dist: aiosqlite; extra == "coreweave"
188
+ Requires-Dist: greenlet; extra == "coreweave"
173
189
  Provides-Extra: scp
174
190
  Requires-Dist: ray[default]>=2.6.1; extra == "scp"
175
191
  Requires-Dist: casbin; extra == "scp"
@@ -371,51 +387,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "shadeform"
371
387
  Requires-Dist: aiosqlite; extra == "shadeform"
372
388
  Requires-Dist: greenlet; extra == "shadeform"
373
389
  Provides-Extra: all
374
- Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
375
- Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
376
- Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
390
+ Requires-Dist: msgraph-sdk; extra == "all"
377
391
  Requires-Dist: ibm-cloud-sdk-core; extra == "all"
378
- Requires-Dist: botocore>=1.29.10; extra == "all"
379
- Requires-Dist: websockets; extra == "all"
380
- Requires-Dist: msrestazure; extra == "all"
381
- Requires-Dist: colorama<0.4.5; extra == "all"
382
- Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
383
- Requires-Dist: ecsapi>=0.2.0; extra == "all"
392
+ Requires-Dist: ibm-vpc; extra == "all"
393
+ Requires-Dist: azure-core>=1.31.0; extra == "all"
394
+ Requires-Dist: awscli>=1.27.10; extra == "all"
395
+ Requires-Dist: sqlalchemy_adapter; extra == "all"
396
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
397
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
398
+ Requires-Dist: ray[default]>=2.6.1; extra == "all"
384
399
  Requires-Dist: nebius>=0.2.47; extra == "all"
400
+ Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
401
+ Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
402
+ Requires-Dist: casbin; extra == "all"
385
403
  Requires-Dist: grpcio>=1.63.0; extra == "all"
386
- Requires-Dist: awscli>=1.27.10; extra == "all"
404
+ Requires-Dist: aiosqlite; extra == "all"
405
+ Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
387
406
  Requires-Dist: runpod>=1.6.1; extra == "all"
407
+ Requires-Dist: greenlet; extra == "all"
388
408
  Requires-Dist: azure-common; extra == "all"
389
- Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
390
- Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
409
+ Requires-Dist: colorama<0.4.5; extra == "all"
391
410
  Requires-Dist: google-cloud-storage; extra == "all"
392
- Requires-Dist: msgraph-sdk; extra == "all"
393
- Requires-Dist: ray[default]>=2.6.1; extra == "all"
394
- Requires-Dist: boto3>=1.26.1; extra == "all"
395
- Requires-Dist: azure-core>=1.31.0; extra == "all"
396
- Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
397
- Requires-Dist: azure-identity>=1.19.0; extra == "all"
398
- Requires-Dist: oci; extra == "all"
399
- Requires-Dist: greenlet; extra == "all"
411
+ Requires-Dist: websockets; extra == "all"
412
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
413
+ Requires-Dist: msrestazure; extra == "all"
400
414
  Requires-Dist: tomli; python_version < "3.11" and extra == "all"
401
- Requires-Dist: sqlalchemy_adapter; extra == "all"
402
- Requires-Dist: anyio; extra == "all"
403
- Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
404
- Requires-Dist: ibm-vpc; extra == "all"
415
+ Requires-Dist: ecsapi>=0.2.0; extra == "all"
416
+ Requires-Dist: python-dateutil; extra == "all"
405
417
  Requires-Dist: passlib; extra == "all"
418
+ Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
419
+ Requires-Dist: docker; extra == "all"
420
+ Requires-Dist: anyio; extra == "all"
406
421
  Requires-Dist: ibm-cos-sdk; extra == "all"
407
422
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
408
- Requires-Dist: aiosqlite; extra == "all"
409
- Requires-Dist: azure-core>=1.24.0; extra == "all"
410
- Requires-Dist: aiohttp; extra == "all"
411
- Requires-Dist: docker; extra == "all"
412
- Requires-Dist: pydo>=0.3.0; extra == "all"
413
- Requires-Dist: casbin; extra == "all"
414
423
  Requires-Dist: pyjwt; extra == "all"
415
- Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
416
- Requires-Dist: python-dateutil; extra == "all"
424
+ Requires-Dist: oci; extra == "all"
425
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
417
426
  Requires-Dist: cudo-compute>=0.1.10; extra == "all"
427
+ Requires-Dist: azure-core>=1.24.0; extra == "all"
428
+ Requires-Dist: aiohttp; extra == "all"
429
+ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
430
+ Requires-Dist: botocore>=1.29.10; extra == "all"
418
431
  Requires-Dist: azure-cli>=2.65.0; extra == "all"
432
+ Requires-Dist: boto3>=1.26.1; extra == "all"
433
+ Requires-Dist: pydo>=0.3.0; extra == "all"
434
+ Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
419
435
  Provides-Extra: remote
420
436
  Requires-Dist: grpcio>=1.63.0; extra == "remote"
421
437
  Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "remote"
@@ -479,7 +495,7 @@ Dynamic: summary
479
495
  ----
480
496
 
481
497
  :fire: *News* :fire:
482
- - [Jul 2025] Run **RL training for LLMs** with SkyRL on your Kubernetes or clouds: [**example**](./llm/skyrl/)
498
+ - [Oct 2025] Run **RL training for LLMs** with SkyRL on your Kubernetes or clouds: [**example**](./llm/skyrl/)
483
499
  - [Oct 2025] Train and serve [Andrej Karpathy's](https://x.com/karpathy/status/1977755427569111362) **nanochat** - the best ChatGPT that $100 can buy: [**example**](./llm/nanochat)
484
500
  - [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
485
501
  - [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)