skypilot-nightly 1.0.0.dev20251026__py3-none-any.whl → 1.0.0.dev20251029__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +34 -15
- sky/client/sdk.py +4 -4
- sky/cloud_stores.py +73 -0
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{wDQ7aGvICzMNmjIaC37TT → DabuSAKsc_y0wyJxpTIdQ}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-4abaae354da0ba13.js → webpack-485984ca04e021d0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +39 -0
- sky/data/storage.py +166 -9
- sky/global_user_state.py +59 -83
- sky/jobs/server/server.py +2 -2
- sky/jobs/utils.py +5 -6
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +2 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +182 -84
- sky/server/requests/serializers/decoders.py +3 -3
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +34 -7
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/task.py +10 -0
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/resources_utils.py +53 -29
- {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/METADATA +50 -34
- {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/RECORD +74 -73
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{wDQ7aGvICzMNmjIaC37TT → DabuSAKsc_y0wyJxpTIdQ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251026.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/top_level.txt +0 -0
|
@@ -60,13 +60,23 @@ def encode_status(
|
|
|
60
60
|
clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
|
|
61
61
|
response = []
|
|
62
62
|
for cluster in clusters:
|
|
63
|
-
response_cluster = cluster.model_dump()
|
|
63
|
+
response_cluster = cluster.model_dump(exclude_none=True)
|
|
64
|
+
# These default setting is needed because last_use and status_updated_at
|
|
65
|
+
# used to be not optional.
|
|
66
|
+
# TODO(syang): remove this after v0.10.7 or v0.11.0
|
|
67
|
+
if 'last_use' not in response_cluster:
|
|
68
|
+
response_cluster['last_use'] = ''
|
|
69
|
+
if 'status_updated_at' not in response_cluster:
|
|
70
|
+
response_cluster['status_updated_at'] = 0
|
|
64
71
|
response_cluster['status'] = cluster['status'].value
|
|
65
72
|
handle = serialize_utils.prepare_handle_for_backwards_compatibility(
|
|
66
73
|
cluster['handle'])
|
|
67
74
|
response_cluster['handle'] = pickle_and_encode(handle)
|
|
75
|
+
# TODO (syang) We still need to return this field for backwards
|
|
76
|
+
# compatibility.
|
|
77
|
+
# Remove this field at or after v0.10.7 or v0.11.0
|
|
68
78
|
response_cluster['storage_mounts_metadata'] = pickle_and_encode(
|
|
69
|
-
|
|
79
|
+
None) # Always returns None.
|
|
70
80
|
response.append(response_cluster)
|
|
71
81
|
return response
|
|
72
82
|
|
|
@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
|
|
|
206
216
|
@register_encoder('storage_ls')
|
|
207
217
|
def encode_storage_ls(
|
|
208
218
|
return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
|
|
209
|
-
for storage_info in return_value
|
|
219
|
+
response_list = [storage_info.model_dump() for storage_info in return_value]
|
|
220
|
+
for storage_info in response_list:
|
|
210
221
|
storage_info['status'] = storage_info['status'].value
|
|
211
222
|
storage_info['store'] = [store.value for store in storage_info['store']]
|
|
212
|
-
return
|
|
223
|
+
return response_list
|
|
213
224
|
|
|
214
225
|
|
|
215
226
|
@register_encoder('volume_list')
|
|
@@ -219,11 +230,11 @@ def encode_volume_list(
|
|
|
219
230
|
|
|
220
231
|
|
|
221
232
|
@register_encoder('job_status')
|
|
222
|
-
def encode_job_status(return_value: Dict[int, Any]) -> Dict[
|
|
233
|
+
def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
|
|
223
234
|
for job_id in return_value.keys():
|
|
224
235
|
if return_value[job_id] is not None:
|
|
225
236
|
return_value[job_id] = return_value[job_id].value
|
|
226
|
-
return return_value
|
|
237
|
+
return {str(k): v for k, v in return_value.items()}
|
|
227
238
|
|
|
228
239
|
|
|
229
240
|
@register_encoder('kubernetes_node_info')
|
|
@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
|
|
|
235
246
|
@register_encoder('endpoints')
|
|
236
247
|
def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
|
|
237
248
|
return {str(k): v for k, v in return_value.items()}
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@register_encoder('realtime_kubernetes_gpu_availability')
|
|
252
|
+
def encode_realtime_gpu_availability(
|
|
253
|
+
return_value: List[Tuple[str,
|
|
254
|
+
List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
|
|
255
|
+
# Convert RealtimeGpuAvailability namedtuples to lists
|
|
256
|
+
# for JSON serialization.
|
|
257
|
+
result = []
|
|
258
|
+
for context, gpu_list in return_value:
|
|
259
|
+
gpu_availability_list = []
|
|
260
|
+
for gpu in gpu_list:
|
|
261
|
+
gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
|
|
262
|
+
gpu_availability_list.append(gpu_list_item)
|
|
263
|
+
result.append((context, gpu_availability_list))
|
|
264
|
+
return result
|
sky/server/server.py
CHANGED
|
@@ -25,6 +25,7 @@ import zipfile
|
|
|
25
25
|
import aiofiles
|
|
26
26
|
import anyio
|
|
27
27
|
import fastapi
|
|
28
|
+
from fastapi import responses as fastapi_responses
|
|
28
29
|
from fastapi.middleware import cors
|
|
29
30
|
import starlette.middleware.base
|
|
30
31
|
import uvloop
|
|
@@ -1497,10 +1498,27 @@ async def local_down(request: fastapi.Request,
|
|
|
1497
1498
|
)
|
|
1498
1499
|
|
|
1499
1500
|
|
|
1501
|
+
async def get_expanded_request_id(request_id: str) -> str:
|
|
1502
|
+
"""Gets the expanded request ID for a given request ID prefix."""
|
|
1503
|
+
request_tasks = await requests_lib.get_requests_async_with_prefix(
|
|
1504
|
+
request_id, fields=['request_id'])
|
|
1505
|
+
if request_tasks is None:
|
|
1506
|
+
raise fastapi.HTTPException(status_code=404,
|
|
1507
|
+
detail=f'Request {request_id!r} not found')
|
|
1508
|
+
if len(request_tasks) > 1:
|
|
1509
|
+
raise fastapi.HTTPException(status_code=400,
|
|
1510
|
+
detail=('Multiple requests found for '
|
|
1511
|
+
f'request ID prefix: {request_id}'))
|
|
1512
|
+
return request_tasks[0].request_id
|
|
1513
|
+
|
|
1514
|
+
|
|
1500
1515
|
# === API server related APIs ===
|
|
1501
|
-
@app.get('/api/get')
|
|
1516
|
+
@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
|
|
1502
1517
|
async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
1503
1518
|
"""Gets a request with a given request ID prefix."""
|
|
1519
|
+
# Validate request_id prefix matches a single request.
|
|
1520
|
+
request_id = await get_expanded_request_id(request_id)
|
|
1521
|
+
|
|
1504
1522
|
while True:
|
|
1505
1523
|
req_status = await requests_lib.get_request_status_async(request_id)
|
|
1506
1524
|
if req_status is None:
|
|
@@ -1560,11 +1578,16 @@ async def stream(
|
|
|
1560
1578
|
clients, console for CLI/API clients), 'plain' (force plain text),
|
|
1561
1579
|
'html' (force HTML), or 'console' (force console)
|
|
1562
1580
|
"""
|
|
1581
|
+
# We need to save the user-supplied request ID for the response header.
|
|
1582
|
+
user_supplied_request_id = request_id
|
|
1563
1583
|
if request_id is not None and log_path is not None:
|
|
1564
1584
|
raise fastapi.HTTPException(
|
|
1565
1585
|
status_code=400,
|
|
1566
1586
|
detail='Only one of request_id and log_path can be provided')
|
|
1567
1587
|
|
|
1588
|
+
if request_id is not None:
|
|
1589
|
+
request_id = await get_expanded_request_id(request_id)
|
|
1590
|
+
|
|
1568
1591
|
if request_id is None and log_path is None:
|
|
1569
1592
|
request_id = await requests_lib.get_latest_request_id_async()
|
|
1570
1593
|
if request_id is None:
|
|
@@ -1654,7 +1677,9 @@ async def stream(
|
|
|
1654
1677
|
'Transfer-Encoding': 'chunked'
|
|
1655
1678
|
}
|
|
1656
1679
|
if request_id is not None:
|
|
1657
|
-
headers[server_constants.STREAM_REQUEST_HEADER] =
|
|
1680
|
+
headers[server_constants.STREAM_REQUEST_HEADER] = (
|
|
1681
|
+
user_supplied_request_id
|
|
1682
|
+
if user_supplied_request_id else request_id)
|
|
1658
1683
|
|
|
1659
1684
|
return fastapi.responses.StreamingResponse(
|
|
1660
1685
|
content=stream_utils.log_streamer(request_id,
|
|
@@ -1676,7 +1701,7 @@ async def api_cancel(request: fastapi.Request,
|
|
|
1676
1701
|
request_id=request.state.request_id,
|
|
1677
1702
|
request_name='api_cancel',
|
|
1678
1703
|
request_body=request_cancel_body,
|
|
1679
|
-
func=requests_lib.
|
|
1704
|
+
func=requests_lib.kill_requests_with_prefix,
|
|
1680
1705
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1681
1706
|
)
|
|
1682
1707
|
|
|
@@ -1684,7 +1709,7 @@ async def api_cancel(request: fastapi.Request,
|
|
|
1684
1709
|
@app.get('/api/status')
|
|
1685
1710
|
async def api_status(
|
|
1686
1711
|
request_ids: Optional[List[str]] = fastapi.Query(
|
|
1687
|
-
None, description='Request
|
|
1712
|
+
None, description='Request ID prefixes to get status for.'),
|
|
1688
1713
|
all_status: bool = fastapi.Query(
|
|
1689
1714
|
False, description='Get finished requests as well.'),
|
|
1690
1715
|
limit: Optional[int] = fastapi.Query(
|
|
@@ -1711,10 +1736,12 @@ async def api_status(
|
|
|
1711
1736
|
else:
|
|
1712
1737
|
encoded_request_tasks = []
|
|
1713
1738
|
for request_id in request_ids:
|
|
1714
|
-
|
|
1715
|
-
|
|
1739
|
+
request_tasks = await requests_lib.get_requests_async_with_prefix(
|
|
1740
|
+
request_id)
|
|
1741
|
+
if request_tasks is None:
|
|
1716
1742
|
continue
|
|
1717
|
-
|
|
1743
|
+
for request_task in request_tasks:
|
|
1744
|
+
encoded_request_tasks.append(request_task.readable_encode())
|
|
1718
1745
|
return encoded_request_tasks
|
|
1719
1746
|
|
|
1720
1747
|
|
sky/server/stream_utils.py
CHANGED
|
@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
25
25
|
_BUFFER_SIZE = 8 * 1024 # 8KB
|
|
26
26
|
_BUFFER_TIMEOUT = 0.02 # 20ms
|
|
27
27
|
_HEARTBEAT_INTERVAL = 30
|
|
28
|
+
_READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
|
|
29
|
+
|
|
28
30
|
# If a SHORT request has been stuck in pending for
|
|
29
31
|
# _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
|
|
30
32
|
_SHORT_REQUEST_SPINNER_TIMEOUT = 2
|
|
@@ -235,6 +237,9 @@ async def _tail_log_file(
|
|
|
235
237
|
buffer_bytes = 0
|
|
236
238
|
last_flush_time = asyncio.get_event_loop().time()
|
|
237
239
|
|
|
240
|
+
# Read file in chunks instead of line-by-line for better performance
|
|
241
|
+
incomplete_line = b'' # Buffer for incomplete lines across chunks
|
|
242
|
+
|
|
238
243
|
async def flush_buffer() -> AsyncGenerator[str, None]:
|
|
239
244
|
nonlocal buffer, buffer_bytes, last_flush_time
|
|
240
245
|
if buffer:
|
|
@@ -255,8 +260,23 @@ async def _tail_log_file(
|
|
|
255
260
|
async for chunk in flush_buffer():
|
|
256
261
|
yield chunk
|
|
257
262
|
|
|
258
|
-
|
|
259
|
-
|
|
263
|
+
# Read file in chunks for better I/O performance
|
|
264
|
+
file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
|
|
265
|
+
if not file_chunk:
|
|
266
|
+
# Process any remaining incomplete line
|
|
267
|
+
if incomplete_line:
|
|
268
|
+
line_str = incomplete_line.decode('utf-8')
|
|
269
|
+
if plain_logs:
|
|
270
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
271
|
+
line_str, raise_for_mismatch=False)
|
|
272
|
+
if not is_payload:
|
|
273
|
+
buffer.append(line_str)
|
|
274
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
275
|
+
else:
|
|
276
|
+
buffer.append(line_str)
|
|
277
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
278
|
+
incomplete_line = b''
|
|
279
|
+
|
|
260
280
|
# Avoid checking the status too frequently to avoid overloading the
|
|
261
281
|
# DB.
|
|
262
282
|
should_check_status = (current_time -
|
|
@@ -328,16 +348,39 @@ async def _tail_log_file(
|
|
|
328
348
|
# performance but it helps avoid unnecessary heartbeat strings
|
|
329
349
|
# being printed when the client runs in an old version.
|
|
330
350
|
last_heartbeat_time = asyncio.get_event_loop().time()
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
351
|
+
|
|
352
|
+
# Combine with any incomplete line from previous chunk
|
|
353
|
+
file_chunk = incomplete_line + file_chunk
|
|
354
|
+
incomplete_line = b''
|
|
355
|
+
|
|
356
|
+
# Split chunk into lines, preserving line structure
|
|
357
|
+
lines_bytes = file_chunk.split(b'\n')
|
|
358
|
+
|
|
359
|
+
# If chunk doesn't end with newline, the last element is incomplete
|
|
360
|
+
if file_chunk and not file_chunk.endswith(b'\n'):
|
|
361
|
+
incomplete_line = lines_bytes[-1]
|
|
362
|
+
lines_bytes = lines_bytes[:-1]
|
|
363
|
+
else:
|
|
364
|
+
# If ends with \n, split creates an empty last element we should
|
|
365
|
+
# ignore
|
|
366
|
+
if lines_bytes and lines_bytes[-1] == b'':
|
|
367
|
+
lines_bytes = lines_bytes[:-1]
|
|
368
|
+
|
|
369
|
+
# Process all complete lines in this chunk
|
|
370
|
+
for line_bytes in lines_bytes:
|
|
371
|
+
# Reconstruct line with newline (since split removed it)
|
|
372
|
+
line_str = line_bytes.decode('utf-8') + '\n'
|
|
373
|
+
|
|
374
|
+
if plain_logs:
|
|
375
|
+
is_payload, line_str = message_utils.decode_payload(
|
|
376
|
+
line_str, raise_for_mismatch=False)
|
|
377
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
|
378
|
+
# sending invisible characters might be okay.
|
|
379
|
+
if is_payload:
|
|
380
|
+
continue
|
|
381
|
+
|
|
382
|
+
buffer.append(line_str)
|
|
383
|
+
buffer_bytes += len(line_str.encode('utf-8'))
|
|
341
384
|
|
|
342
385
|
# Flush remaining lines in the buffer.
|
|
343
386
|
async for chunk in flush_buffer():
|
|
@@ -373,7 +416,7 @@ def stream_response(
|
|
|
373
416
|
async def on_disconnect():
|
|
374
417
|
logger.info(f'User terminated the connection for request '
|
|
375
418
|
f'{request_id}')
|
|
376
|
-
requests_lib.
|
|
419
|
+
await requests_lib.kill_request_async(request_id)
|
|
377
420
|
|
|
378
421
|
# The background task will be run after returning a response.
|
|
379
422
|
# https://fastapi.tiangolo.com/tutorial/background-tasks/
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -49,6 +49,7 @@ install_requires = [
|
|
|
49
49
|
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
|
|
50
50
|
'pyyaml > 3.13, != 5.4.*',
|
|
51
51
|
'ijson',
|
|
52
|
+
'orjson',
|
|
52
53
|
'requests',
|
|
53
54
|
# SkyPilot inherits from uvicorn.Server to customize the behavior of
|
|
54
55
|
# uvicorn, so we need to pin uvicorn version to avoid potential break
|
|
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
|
|
|
187
188
|
'docker': ['docker'] + local_ray,
|
|
188
189
|
'lambda': [], # No dependencies needed for lambda
|
|
189
190
|
'cloudflare': aws_dependencies,
|
|
191
|
+
'coreweave': aws_dependencies,
|
|
190
192
|
'scp': local_ray,
|
|
191
193
|
'oci': ['oci'],
|
|
192
194
|
# Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
|
sky/task.py
CHANGED
|
@@ -1552,6 +1552,16 @@ class Task:
|
|
|
1552
1552
|
self.update_file_mounts({
|
|
1553
1553
|
mnt_path: blob_path,
|
|
1554
1554
|
})
|
|
1555
|
+
elif store_type is storage_lib.StoreType.COREWEAVE:
|
|
1556
|
+
if storage.source is not None and not isinstance(
|
|
1557
|
+
storage.source,
|
|
1558
|
+
list) and storage.source.startswith('cw://'):
|
|
1559
|
+
blob_path = storage.source
|
|
1560
|
+
else:
|
|
1561
|
+
blob_path = 'cw://' + storage.name
|
|
1562
|
+
self.update_file_mounts({
|
|
1563
|
+
mnt_path: blob_path,
|
|
1564
|
+
})
|
|
1555
1565
|
else:
|
|
1556
1566
|
with ux_utils.print_exception_no_traceback():
|
|
1557
1567
|
raise ValueError(f'Storage Type {store_type} '
|
sky/templates/nebius-ray.yml.j2
CHANGED
|
@@ -156,6 +156,7 @@ setup_commands:
|
|
|
156
156
|
echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
|
|
157
157
|
{%- endfor %}
|
|
158
158
|
{%- endif %}
|
|
159
|
+
IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
|
|
159
160
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
|
160
161
|
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
|
|
161
162
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/sky-cluster-key\n IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
|
|
@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
|
|
|
282
282
|
if resources_str_full is not None:
|
|
283
283
|
resources_str = resources_str_full
|
|
284
284
|
if resources_str is None:
|
|
285
|
-
|
|
286
|
-
|
|
285
|
+
resources_str_simple, resources_str_full = (
|
|
286
|
+
resources_utils.get_readable_resources_repr(
|
|
287
|
+
handle, simplified_only=truncate))
|
|
288
|
+
if truncate:
|
|
289
|
+
resources_str = resources_str_simple
|
|
290
|
+
else:
|
|
291
|
+
assert resources_str_full is not None
|
|
292
|
+
resources_str = resources_str_full
|
|
287
293
|
|
|
288
294
|
return resources_str
|
|
289
295
|
return '-'
|
sky/utils/context_utils.py
CHANGED
|
@@ -8,6 +8,7 @@ import multiprocessing
|
|
|
8
8
|
import os
|
|
9
9
|
import subprocess
|
|
10
10
|
import sys
|
|
11
|
+
import time
|
|
11
12
|
import typing
|
|
12
13
|
from typing import Any, Callable, IO, Optional, Tuple, TypeVar
|
|
13
14
|
|
|
@@ -18,6 +19,7 @@ from sky.utils import context
|
|
|
18
19
|
from sky.utils import subprocess_utils
|
|
19
20
|
|
|
20
21
|
StreamHandler = Callable[[IO[Any], IO[Any]], str]
|
|
22
|
+
PASSTHROUGH_FLUSH_INTERVAL_SECONDS = 0.5
|
|
21
23
|
|
|
22
24
|
logger = sky_logging.init_logger(__name__)
|
|
23
25
|
|
|
@@ -46,6 +48,7 @@ def hijack_sys_attrs():
|
|
|
46
48
|
|
|
47
49
|
def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
|
|
48
50
|
"""Passthrough the stream from the process to the output stream"""
|
|
51
|
+
last_flush_time = time.time()
|
|
49
52
|
wrapped = io.TextIOWrapper(in_stream,
|
|
50
53
|
encoding='utf-8',
|
|
51
54
|
newline='',
|
|
@@ -55,9 +58,18 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
|
|
|
55
58
|
line = wrapped.readline()
|
|
56
59
|
if line:
|
|
57
60
|
out_stream.write(line)
|
|
58
|
-
|
|
61
|
+
|
|
62
|
+
# Flush based on timeout instead of on every line
|
|
63
|
+
current_time = time.time()
|
|
64
|
+
if (current_time - last_flush_time >=
|
|
65
|
+
PASSTHROUGH_FLUSH_INTERVAL_SECONDS):
|
|
66
|
+
out_stream.flush()
|
|
67
|
+
last_flush_time = current_time
|
|
59
68
|
else:
|
|
60
69
|
break
|
|
70
|
+
|
|
71
|
+
# Final flush to ensure all data is written
|
|
72
|
+
out_stream.flush()
|
|
61
73
|
return ''
|
|
62
74
|
|
|
63
75
|
|
sky/utils/resources_utils.py
CHANGED
|
@@ -181,57 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
|
|
|
181
181
|
|
|
182
182
|
|
|
183
183
|
def format_resource(resource: 'resources_lib.Resources',
|
|
184
|
-
|
|
184
|
+
simplified_only: bool = False) -> Tuple[str, Optional[str]]:
|
|
185
185
|
resource = resource.assert_launchable()
|
|
186
|
-
|
|
187
|
-
|
|
186
|
+
is_k8s = str(resource.cloud).lower() == 'kubernetes'
|
|
187
|
+
if resource.accelerators is None or is_k8s or not simplified_only:
|
|
188
|
+
vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
|
|
189
|
+
resource.instance_type)
|
|
188
190
|
|
|
189
|
-
|
|
191
|
+
elements_simple = []
|
|
192
|
+
elements_full = []
|
|
190
193
|
|
|
191
194
|
if resource.accelerators is not None:
|
|
192
195
|
acc, count = list(resource.accelerators.items())[0]
|
|
193
|
-
|
|
196
|
+
elements_simple.append(f'gpus={acc}:{count}')
|
|
197
|
+
elements_full.append(f'gpus={acc}:{count}')
|
|
194
198
|
|
|
195
|
-
|
|
196
|
-
|
|
199
|
+
if (resource.accelerators is None or is_k8s):
|
|
200
|
+
if vcpu is not None:
|
|
201
|
+
elements_simple.append(f'cpus={int(vcpu)}')
|
|
202
|
+
elements_full.append(f'cpus={int(vcpu)}')
|
|
203
|
+
if mem is not None:
|
|
204
|
+
elements_simple.append(f'mem={int(mem)}')
|
|
205
|
+
elements_full.append(f'mem={int(mem)}')
|
|
206
|
+
elif not simplified_only:
|
|
197
207
|
if vcpu is not None:
|
|
198
|
-
|
|
208
|
+
elements_full.append(f'cpus={int(vcpu)}')
|
|
199
209
|
if mem is not None:
|
|
200
|
-
|
|
210
|
+
elements_full.append(f'mem={int(mem)}')
|
|
201
211
|
|
|
202
|
-
instance_type = resource.instance_type
|
|
203
|
-
if simplify:
|
|
204
|
-
instance_type = common_utils.truncate_long_string(instance_type, 15)
|
|
205
212
|
if not is_k8s:
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
213
|
+
instance_type_full = resource.instance_type
|
|
214
|
+
instance_type_simple = common_utils.truncate_long_string(
|
|
215
|
+
instance_type_full, 15)
|
|
216
|
+
elements_simple.append(instance_type_simple)
|
|
217
|
+
elements_full.append(instance_type_full)
|
|
218
|
+
elements_simple.append('...')
|
|
219
|
+
if not simplified_only:
|
|
210
220
|
image_id = resource.image_id
|
|
211
221
|
if image_id is not None:
|
|
212
222
|
if None in image_id:
|
|
213
|
-
|
|
223
|
+
elements_full.append(f'image_id={image_id[None]}')
|
|
214
224
|
else:
|
|
215
|
-
|
|
216
|
-
|
|
225
|
+
elements_full.append(f'image_id={image_id}')
|
|
226
|
+
elements_full.append(f'disk={resource.disk_size}')
|
|
217
227
|
disk_tier = resource.disk_tier
|
|
218
228
|
if disk_tier is not None:
|
|
219
|
-
|
|
229
|
+
elements_full.append(f'disk_tier={disk_tier.value}')
|
|
220
230
|
ports = resource.ports
|
|
221
231
|
if ports is not None:
|
|
222
|
-
|
|
232
|
+
elements_full.append(f'ports={ports}')
|
|
223
233
|
|
|
224
234
|
spot = '[spot]' if resource.use_spot else ''
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
235
|
+
resources_str_simple = (
|
|
236
|
+
f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
|
|
237
|
+
if simplified_only:
|
|
238
|
+
return resources_str_simple, None
|
|
239
|
+
else:
|
|
240
|
+
resources_str_full = (
|
|
241
|
+
f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
|
|
242
|
+
return resources_str_simple, resources_str_full
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def get_readable_resources_repr(
|
|
246
|
+
handle: 'backends.CloudVmRayResourceHandle',
|
|
247
|
+
simplified_only: bool = False) -> Tuple[str, Optional[str]]:
|
|
248
|
+
resource_str_simple, resource_str_full = format_resource(
|
|
249
|
+
handle.launched_resources, simplified_only)
|
|
250
|
+
if not simplified_only:
|
|
251
|
+
assert resource_str_full is not None
|
|
230
252
|
if (handle.launched_nodes is not None and
|
|
231
253
|
handle.launched_resources is not None):
|
|
232
|
-
return (f'{handle.launched_nodes}x'
|
|
233
|
-
|
|
234
|
-
|
|
254
|
+
return (f'{handle.launched_nodes}x{resource_str_simple}',
|
|
255
|
+
None if simplified_only else
|
|
256
|
+
f'{handle.launched_nodes}x{resource_str_full}')
|
|
257
|
+
return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
|
|
258
|
+
_DEFAULT_MESSAGE_HANDLE_INITIALIZING)
|
|
235
259
|
|
|
236
260
|
|
|
237
261
|
def make_ray_custom_resources_str(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.
|
|
3
|
+
Version: 1.0.0.dev20251029
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -44,6 +44,7 @@ Requires-Dist: psutil
|
|
|
44
44
|
Requires-Dist: pulp
|
|
45
45
|
Requires-Dist: pyyaml!=5.4.*,>3.13
|
|
46
46
|
Requires-Dist: ijson
|
|
47
|
+
Requires-Dist: orjson
|
|
47
48
|
Requires-Dist: requests
|
|
48
49
|
Requires-Dist: uvicorn[standard]<0.36.0,>=0.33.0
|
|
49
50
|
Requires-Dist: fastapi
|
|
@@ -170,6 +171,21 @@ Requires-Dist: grpcio>=1.63.0; extra == "cloudflare"
|
|
|
170
171
|
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "cloudflare"
|
|
171
172
|
Requires-Dist: aiosqlite; extra == "cloudflare"
|
|
172
173
|
Requires-Dist: greenlet; extra == "cloudflare"
|
|
174
|
+
Provides-Extra: coreweave
|
|
175
|
+
Requires-Dist: awscli>=1.27.10; extra == "coreweave"
|
|
176
|
+
Requires-Dist: botocore>=1.29.10; extra == "coreweave"
|
|
177
|
+
Requires-Dist: boto3>=1.26.1; extra == "coreweave"
|
|
178
|
+
Requires-Dist: colorama<0.4.5; extra == "coreweave"
|
|
179
|
+
Requires-Dist: casbin; extra == "coreweave"
|
|
180
|
+
Requires-Dist: sqlalchemy_adapter; extra == "coreweave"
|
|
181
|
+
Requires-Dist: passlib; extra == "coreweave"
|
|
182
|
+
Requires-Dist: pyjwt; extra == "coreweave"
|
|
183
|
+
Requires-Dist: aiohttp; extra == "coreweave"
|
|
184
|
+
Requires-Dist: anyio; extra == "coreweave"
|
|
185
|
+
Requires-Dist: grpcio>=1.63.0; extra == "coreweave"
|
|
186
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "coreweave"
|
|
187
|
+
Requires-Dist: aiosqlite; extra == "coreweave"
|
|
188
|
+
Requires-Dist: greenlet; extra == "coreweave"
|
|
173
189
|
Provides-Extra: scp
|
|
174
190
|
Requires-Dist: ray[default]>=2.6.1; extra == "scp"
|
|
175
191
|
Requires-Dist: casbin; extra == "scp"
|
|
@@ -371,51 +387,51 @@ Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "shadeform"
|
|
|
371
387
|
Requires-Dist: aiosqlite; extra == "shadeform"
|
|
372
388
|
Requires-Dist: greenlet; extra == "shadeform"
|
|
373
389
|
Provides-Extra: all
|
|
374
|
-
Requires-Dist:
|
|
375
|
-
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
376
|
-
Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
|
|
390
|
+
Requires-Dist: msgraph-sdk; extra == "all"
|
|
377
391
|
Requires-Dist: ibm-cloud-sdk-core; extra == "all"
|
|
378
|
-
Requires-Dist:
|
|
379
|
-
Requires-Dist:
|
|
380
|
-
Requires-Dist:
|
|
381
|
-
Requires-Dist:
|
|
382
|
-
Requires-Dist: azure-mgmt-
|
|
383
|
-
Requires-Dist:
|
|
392
|
+
Requires-Dist: ibm-vpc; extra == "all"
|
|
393
|
+
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
|
394
|
+
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
395
|
+
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
396
|
+
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
|
397
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
398
|
+
Requires-Dist: ray[default]>=2.6.1; extra == "all"
|
|
384
399
|
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
400
|
+
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
|
401
|
+
Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
|
|
402
|
+
Requires-Dist: casbin; extra == "all"
|
|
385
403
|
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
386
|
-
Requires-Dist:
|
|
404
|
+
Requires-Dist: aiosqlite; extra == "all"
|
|
405
|
+
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
387
406
|
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
407
|
+
Requires-Dist: greenlet; extra == "all"
|
|
388
408
|
Requires-Dist: azure-common; extra == "all"
|
|
389
|
-
Requires-Dist:
|
|
390
|
-
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
409
|
+
Requires-Dist: colorama<0.4.5; extra == "all"
|
|
391
410
|
Requires-Dist: google-cloud-storage; extra == "all"
|
|
392
|
-
Requires-Dist:
|
|
393
|
-
Requires-Dist:
|
|
394
|
-
Requires-Dist:
|
|
395
|
-
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
|
396
|
-
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
397
|
-
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
398
|
-
Requires-Dist: oci; extra == "all"
|
|
399
|
-
Requires-Dist: greenlet; extra == "all"
|
|
411
|
+
Requires-Dist: websockets; extra == "all"
|
|
412
|
+
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
|
413
|
+
Requires-Dist: msrestazure; extra == "all"
|
|
400
414
|
Requires-Dist: tomli; python_version < "3.11" and extra == "all"
|
|
401
|
-
Requires-Dist:
|
|
402
|
-
Requires-Dist:
|
|
403
|
-
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
|
404
|
-
Requires-Dist: ibm-vpc; extra == "all"
|
|
415
|
+
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
416
|
+
Requires-Dist: python-dateutil; extra == "all"
|
|
405
417
|
Requires-Dist: passlib; extra == "all"
|
|
418
|
+
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
419
|
+
Requires-Dist: docker; extra == "all"
|
|
420
|
+
Requires-Dist: anyio; extra == "all"
|
|
406
421
|
Requires-Dist: ibm-cos-sdk; extra == "all"
|
|
407
422
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
|
408
|
-
Requires-Dist: aiosqlite; extra == "all"
|
|
409
|
-
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
410
|
-
Requires-Dist: aiohttp; extra == "all"
|
|
411
|
-
Requires-Dist: docker; extra == "all"
|
|
412
|
-
Requires-Dist: pydo>=0.3.0; extra == "all"
|
|
413
|
-
Requires-Dist: casbin; extra == "all"
|
|
414
423
|
Requires-Dist: pyjwt; extra == "all"
|
|
415
|
-
Requires-Dist:
|
|
416
|
-
Requires-Dist:
|
|
424
|
+
Requires-Dist: oci; extra == "all"
|
|
425
|
+
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
|
417
426
|
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
|
427
|
+
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
428
|
+
Requires-Dist: aiohttp; extra == "all"
|
|
429
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
430
|
+
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
418
431
|
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
|
432
|
+
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
433
|
+
Requires-Dist: pydo>=0.3.0; extra == "all"
|
|
434
|
+
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
419
435
|
Provides-Extra: remote
|
|
420
436
|
Requires-Dist: grpcio>=1.63.0; extra == "remote"
|
|
421
437
|
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "remote"
|
|
@@ -479,7 +495,7 @@ Dynamic: summary
|
|
|
479
495
|
----
|
|
480
496
|
|
|
481
497
|
:fire: *News* :fire:
|
|
482
|
-
- [
|
|
498
|
+
- [Oct 2025] Run **RL training for LLMs** with SkyRL on your Kubernetes or clouds: [**example**](./llm/skyrl/)
|
|
483
499
|
- [Oct 2025] Train and serve [Andrej Karpathy's](https://x.com/karpathy/status/1977755427569111362) **nanochat** - the best ChatGPT that $100 can buy: [**example**](./llm/nanochat)
|
|
484
500
|
- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
|
|
485
501
|
- [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)
|