skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251029__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/coreweave.py +278 -0
- sky/backends/backend_utils.py +9 -6
- sky/backends/cloud_vm_ray_backend.py +2 -3
- sky/check.py +25 -13
- sky/client/cli/command.py +5 -1
- sky/cloud_stores.py +73 -0
- sky/core.py +7 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → DabuSAKsc_y0wyJxpTIdQ}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-485984ca04e021d0.js} +1 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +39 -0
- sky/data/storage.py +166 -9
- sky/global_user_state.py +14 -18
- sky/jobs/server/server.py +2 -2
- sky/jobs/utils.py +5 -6
- sky/optimizer.py +1 -1
- sky/provision/kubernetes/instance.py +88 -19
- sky/provision/kubernetes/volume.py +2 -2
- sky/schemas/api/responses.py +2 -5
- sky/serve/replica_managers.py +2 -2
- sky/serve/serve_utils.py +9 -2
- sky/server/requests/payloads.py +2 -0
- sky/server/requests/requests.py +137 -102
- sky/server/requests/serializers/decoders.py +0 -6
- sky/server/requests/serializers/encoders.py +33 -6
- sky/server/server.py +2 -1
- sky/server/stream_utils.py +56 -13
- sky/setup_files/dependencies.py +2 -0
- sky/task.py +10 -0
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/context_utils.py +13 -1
- sky/utils/resources_utils.py +53 -29
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/METADATA +52 -36
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/RECORD +73 -72
- sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
- sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
- /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → DabuSAKsc_y0wyJxpTIdQ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/top_level.txt +0 -0
sky/schemas/api/responses.py
CHANGED

@@ -90,7 +90,7 @@ class StatusResponse(ResponseBaseModel):
     # This is an internally facing field anyway, so it's less
     # of a problem that it's not typed.
     handle: Optional[Any] = None
-    last_use: str
+    last_use: Optional[str] = None
     status: status_lib.ClusterStatus
     autostop: int
     to_down: bool
@@ -98,11 +98,8 @@ class StatusResponse(ResponseBaseModel):
     # metadata is a JSON, so we use Any here.
     metadata: Optional[Dict[str, Any]] = None
     cluster_hash: str
-    # pydantic cannot generate the pydantic-core schema for
-    # storage_mounts_metadata, so we use Any here.
-    storage_mounts_metadata: Optional[Dict[str, Any]] = None
     cluster_ever_up: bool
-    status_updated_at: int
+    status_updated_at: Optional[int] = None
     user_hash: str
     user_name: str
     config_hash: Optional[str] = None
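For context, a minimal standalone sketch (assuming pydantic v2; the model below is a stand-in, not the real sky.schemas.api.responses.StatusResponse) of why loosening these fields matters: a required `last_use: str` rejects payloads that omit the field, while `Optional[str] = None` accepts them, which is what lets newer servers drop the field for summary responses.

    from typing import Optional
    import pydantic


    class StrictModel(pydantic.BaseModel):
        last_use: str  # required: validation fails if the field is missing


    class RelaxedModel(pydantic.BaseModel):
        last_use: Optional[str] = None  # optional: a missing field becomes None


    try:
        StrictModel.model_validate({})
    except pydantic.ValidationError as e:
        print('strict model rejects an empty payload:', e.error_count(), 'error')

    print(RelaxedModel.model_validate({}).last_use)  # -> None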
sky/serve/replica_managers.py
CHANGED

@@ -495,8 +495,8 @@ class ReplicaInfo:
         info_dict['cloud'] = repr(handle.launched_resources.cloud)
         info_dict['region'] = handle.launched_resources.region
         info_dict['resources_str'] = (
-            resources_utils.get_readable_resources_repr(
-                …
+            resources_utils.get_readable_resources_repr(
+                handle, simplified_only=True)[0])
         return info_dict

     def __repr__(self) -> str:
sky/serve/serve_utils.py
CHANGED

@@ -1550,8 +1550,15 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
             'handle']
         if replica_handle is not None:
             infra = replica_handle.launched_resources.infra.formatted_str()
-            …
-            …
+            simplified = not show_all
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    replica_handle, simplified_only=simplified))
+            if simplified:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full

         replica_values = [
             service_name,
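Both call sites above consume the same two-element return value. A rough sketch of the contract they appear to rely on (the helper below is a hypothetical stand-in for `resources_utils.get_readable_resources_repr`, whose real implementation lives in sky/utils/resources_utils.py): the first element is always the simplified string, and the second is the full string only when `simplified_only=False`.

    from typing import Optional, Tuple

    # Stand-in: returns a (simplified, full) pair, computing the full form
    # only when the caller asks for it.
    def get_repr_sketch(handle_desc: str,
                        simplified_only: bool) -> Tuple[str, Optional[str]]:
        simple = handle_desc.split(',')[0]  # e.g. '1x A100'
        full = None if simplified_only else handle_desc
        return simple, full

    # Simplified view (as in ReplicaInfo.to_info_dict): take element [0].
    print(get_repr_sketch('1x A100, us-east-1, spot', simplified_only=True)[0])

    # Verbose view (as in _format_replica_table with show_all=True): the full
    # string is expected to be non-None when simplified_only=False.
    simple, full = get_repr_sketch('1x A100, us-east-1, spot',
                                   simplified_only=False)
    assert full is not None
    print(full)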
sky/server/requests/payloads.py
CHANGED

@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
     # Only return fields that are needed for the
     # dashboard / CLI summary response
     summary_response: bool = False
+    # Include the cluster handle in the response
+    include_handle: bool = True


 class StartBody(RequestBody):
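A hedged sketch of what the new flag enables (the class below mirrors only the two flags visible in this diff; the real class is sky.server.requests.payloads.StatusBody): a summary caller can opt out of the pickled cluster handle, which is the heaviest field in the status payload.

    import pydantic

    # Hypothetical stand-in mirroring the flags shown in the diff.
    class StatusBodySketch(pydantic.BaseModel):
        summary_response: bool = False
        include_handle: bool = True

    # A dashboard-style caller that only needs summary columns can now skip
    # the expensive handle field:
    body = StatusBodySketch(summary_response=True, include_handle=False)
    print(body.model_dump_json())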
sky/server/requests/requests.py
CHANGED

@@ -5,7 +5,6 @@ import contextlib
 import dataclasses
 import enum
 import functools
-import json
 import os
 import pathlib
 import shutil
@@ -21,6 +20,7 @@ import uuid
 import anyio
 import colorama
 import filelock
+import orjson

 from sky import exceptions
 from sky import global_user_state
@@ -213,8 +213,8 @@ class Request:
             entrypoint=self.entrypoint.__name__,
             request_body=self.request_body.model_dump_json(),
             status=self.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,
@@ -237,8 +237,8 @@ class Request:
             entrypoint=encoders.pickle_and_encode(self.entrypoint),
             request_body=encoders.pickle_and_encode(self.request_body),
             status=self.status.value,
-            return_value=json.dumps(self.return_value),
-            error=json.dumps(self.error),
+            return_value=orjson.dumps(self.return_value).decode('utf-8'),
+            error=orjson.dumps(self.error).decode('utf-8'),
             pid=self.pid,
             created_at=self.created_at,
             schedule_type=self.schedule_type.value,
@@ -270,8 +270,8 @@ class Request:
             entrypoint=decoders.decode_and_unpickle(payload.entrypoint),
             request_body=decoders.decode_and_unpickle(payload.request_body),
             status=RequestStatus(payload.status),
-            return_value=json.loads(payload.return_value),
-            error=json.loads(payload.error),
+            return_value=orjson.loads(payload.return_value),
+            error=orjson.loads(payload.error),
             pid=payload.pid,
             created_at=payload.created_at,
             schedule_type=ScheduleType(payload.schedule_type),
@@ -328,10 +328,11 @@ def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
             entrypoint=request.entrypoint.__name__
             if request.entrypoint is not None else '',
             request_body=request.request_body.model_dump_json()
-            if request.request_body is not None else json.dumps(None),
+            if request.request_body is not None else
+            orjson.dumps(None).decode('utf-8'),
             status=request.status.value,
-            return_value=json.dumps(None),
-            error=json.dumps(None),
+            return_value=orjson.dumps(None).decode('utf-8'),
+            error=orjson.dumps(None).decode('utf-8'),
             pid=None,
             created_at=request.created_at,
             schedule_type=request.schedule_type.value,
@@ -372,9 +373,9 @@ def _update_request_row_fields(
     if 'user_id' not in fields:
         content['user_id'] = ''
     if 'return_value' not in fields:
-        content['return_value'] = json.dumps(None)
+        content['return_value'] = orjson.dumps(None).decode('utf-8')
     if 'error' not in fields:
-        content['error'] = json.dumps(None)
+        content['error'] = orjson.dumps(None).decode('utf-8')
     if 'schedule_type' not in fields:
         content['schedule_type'] = ScheduleType.SHORT.value
     # Optional fields in RequestPayload
@@ -393,94 +394,6 @@ def _update_request_row_fields(
     return tuple(content[col] for col in REQUEST_COLUMNS)


-def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
-    """Kill all pending and running requests for a cluster.
-
-    Args:
-        cluster_name: the name of the cluster.
-        exclude_request_names: exclude requests with these names. This is to
-            prevent killing the caller request.
-    """
-    request_ids = [
-        request_task.request_id
-        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name],
-            cluster_names=[cluster_name],
-            fields=['request_id']))
-    ]
-    kill_requests(request_ids)
-
-
-def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
-                              user_id: Optional[str] = None) -> List[str]:
-    """Kill requests with a given request ID prefix."""
-    expanded_request_ids: Optional[List[str]] = None
-    if request_ids is not None:
-        expanded_request_ids = []
-        for request_id in request_ids:
-            request_tasks = get_requests_with_prefix(request_id,
-                                                     fields=['request_id'])
-            if request_tasks is None or len(request_tasks) == 0:
-                continue
-            if len(request_tasks) > 1:
-                raise ValueError(f'Multiple requests found for '
-                                 f'request ID prefix: {request_id}')
-            expanded_request_ids.append(request_tasks[0].request_id)
-    return kill_requests(request_ids=expanded_request_ids, user_id=user_id)
-
-
-def kill_requests(request_ids: Optional[List[str]] = None,
-                  user_id: Optional[str] = None) -> List[str]:
-    """Kill a SkyPilot API request and set its status to cancelled.
-
-    Args:
-        request_ids: The request IDs to kill. If None, all requests for the
-            user are killed.
-        user_id: The user ID to kill requests for. If None, all users are
-            killed.
-
-    Returns:
-        A list of request IDs that were cancelled.
-    """
-    if request_ids is None:
-        request_ids = [
-            request_task.request_id
-            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
-                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-                # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'],
-                user_id=user_id,
-                fields=['request_id']))
-        ]
-    cancelled_request_ids = []
-    for request_id in request_ids:
-        with update_request(request_id) as request_record:
-            if request_record is None:
-                logger.debug(f'No request ID {request_id}')
-                continue
-            # Skip internal requests. The internal requests are scheduled with
-            # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
-            if request_record.request_id in set(
-                    event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
-                continue
-            if request_record.status > RequestStatus.RUNNING:
-                logger.debug(f'Request {request_id} already finished')
-                continue
-            if request_record.pid is not None:
-                logger.debug(f'Killing request process {request_record.pid}')
-                # Use SIGTERM instead of SIGKILL:
-                # - The executor can handle SIGTERM gracefully
-                # - After SIGTERM, the executor can reuse the request process
-                #   for other requests, avoiding the overhead of forking a new
-                #   process for each request.
-                os.kill(request_record.pid, signal.SIGTERM)
-            request_record.status = RequestStatus.CANCELLED
-            request_record.finished_at = time.time()
-            cancelled_request_ids.append(request_id)
-    return cancelled_request_ids
-
-
 def create_table(cursor, conn):
     # Enable WAL mode to avoid locking issues.
     # See: issue #1441 and PR #1509
@@ -624,6 +537,128 @@ def request_lock_path(request_id: str) -> str:
     return os.path.join(lock_path, f'.{request_id}.lock')


+def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
+    """Kill all pending and running requests for a cluster.
+
+    Args:
+        cluster_name: the name of the cluster.
+        exclude_request_names: exclude requests with these names. This is to
+            prevent killing the caller request.
+    """
+    request_ids = [
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+            exclude_request_names=[exclude_request_name],
+            cluster_names=[cluster_name],
+            fields=['request_id']))
+    ]
+    _kill_requests(request_ids)
+
+
+def kill_requests_with_prefix(request_ids: Optional[List[str]] = None,
+                              user_id: Optional[str] = None) -> List[str]:
+    """Kill requests with a given request ID prefix."""
+    expanded_request_ids: Optional[List[str]] = None
+    if request_ids is not None:
+        expanded_request_ids = []
+        for request_id in request_ids:
+            request_tasks = get_requests_with_prefix(request_id,
+                                                     fields=['request_id'])
+            if request_tasks is None or len(request_tasks) == 0:
+                continue
+            if len(request_tasks) > 1:
+                raise ValueError(f'Multiple requests found for '
+                                 f'request ID prefix: {request_id}')
+            expanded_request_ids.append(request_tasks[0].request_id)
+    return _kill_requests(request_ids=expanded_request_ids, user_id=user_id)
+
+
+def _should_kill_request(request_id: str,
+                         request_record: Optional[Request]) -> bool:
+    if request_record is None:
+        logger.debug(f'No request ID {request_id}')
+        return False
+    # Skip internal requests. The internal requests are scheduled with
+    # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
+    if request_record.request_id in set(
+            event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
+        return False
+    if request_record.status > RequestStatus.RUNNING:
+        logger.debug(f'Request {request_id} already finished')
+        return False
+    return True
+
+
+def _kill_requests(request_ids: Optional[List[str]] = None,
+                   user_id: Optional[str] = None) -> List[str]:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Args:
+        request_ids: The request IDs to kill. If None, all requests for the
+            user are killed.
+        user_id: The user ID to kill requests for. If None, all users are
+            killed.
+
+    Returns:
+        A list of request IDs that were cancelled.
+    """
+    if request_ids is None:
+        request_ids = [
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
+                status=[RequestStatus.PENDING, RequestStatus.RUNNING],
+                # Avoid cancelling the cancel request itself.
+                exclude_request_names=['sky.api_cancel'],
+                user_id=user_id,
+                fields=['request_id']))
+        ]
+    cancelled_request_ids = []
+    for request_id in request_ids:
+        with update_request(request_id) as request_record:
+            if not _should_kill_request(request_id, request_record):
+                continue
+            if request_record.pid is not None:
+                logger.debug(f'Killing request process {request_record.pid}')
+                # Use SIGTERM instead of SIGKILL:
+                # - The executor can handle SIGTERM gracefully
+                # - After SIGTERM, the executor can reuse the request process
+                #   for other requests, avoiding the overhead of forking a new
+                #   process for each request.
+                os.kill(request_record.pid, signal.SIGTERM)
+            request_record.status = RequestStatus.CANCELLED
+            request_record.finished_at = time.time()
+            cancelled_request_ids.append(request_id)
+    return cancelled_request_ids
+
+
+@init_db_async
+@asyncio_utils.shield
+async def kill_request_async(request_id: str) -> bool:
+    """Kill a SkyPilot API request and set its status to cancelled.
+
+    Returns:
+        True if the request was killed, False otherwise.
+    """
+    async with filelock.AsyncFileLock(request_lock_path(request_id)):
+        request = await _get_request_no_lock_async(request_id)
+        if not _should_kill_request(request_id, request):
+            return False
+        assert request is not None
+        if request.pid is not None:
+            logger.debug(f'Killing request process {request.pid}')
+            # Use SIGTERM instead of SIGKILL:
+            # - The executor can handle SIGTERM gracefully
+            # - After SIGTERM, the executor can reuse the request process
+            #   for other requests, avoiding the overhead of forking a new
+            #   process for each request.
+            os.kill(request.pid, signal.SIGTERM)
+        request.status = RequestStatus.CANCELLED
+        request.finished_at = time.time()
+        await _add_or_update_request_no_lock_async(request)
+        return True
+
+
 @contextlib.contextmanager
 @init_db
 @metrics_lib.time_me
@@ -638,7 +673,7 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
     _add_or_update_request_no_lock(request)


-@…
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_async(request_id: str, status: RequestStatus) -> None:
@@ -650,7 +685,7 @@ async def update_status_async(request_id: str, status: RequestStatus) -> None:
     await _add_or_update_request_no_lock_async(request)


-@…
+@init_db_async
 @metrics_lib.time_me
 @asyncio_utils.shield
 async def update_status_msg_async(request_id: str, status_msg: str) -> None:
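The json-to-orjson swap above changes one detail worth noting: orjson.dumps returns bytes rather than str, hence the .decode('utf-8') at every call site so the stored DB column stays a string. A quick standalone illustration of the round-trip:

    import json
    import orjson

    # json.dumps returns str; orjson.dumps returns bytes.
    assert json.dumps(None) == 'null'
    assert orjson.dumps(None) == b'null'

    # Decoding keeps the stored value identical to the old json-based code.
    assert orjson.dumps(None).decode('utf-8') == json.dumps(None)

    # Round-trip as used by Request.encode / Request.from_payload
    # (orjson.loads accepts both str and bytes input):
    payload = orjson.dumps({'ok': True, 'n': 3}).decode('utf-8')
    assert orjson.loads(payload) == {'ok': True, 'n': 3}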
sky/server/requests/serializers/decoders.py
CHANGED

@@ -60,12 +60,6 @@ def decode_status(
     if 'handle' in cluster and cluster['handle'] is not None:
         cluster['handle'] = decode_and_unpickle(cluster['handle'])
     cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-    # this field is to be deprecated in the future.
-    # do not decode this field if it is not present.
-    if ('storage_mounts_metadata' in cluster and
-            cluster['storage_mounts_metadata'] is not None):
-        cluster['storage_mounts_metadata'] = decode_and_unpickle(
-            cluster['storage_mounts_metadata'])
     if 'is_managed' not in cluster:
         cluster['is_managed'] = False
     response.append(responses.StatusResponse.model_validate(cluster))
sky/server/requests/serializers/encoders.py
CHANGED

@@ -60,13 +60,23 @@ def encode_status(
         clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
     response = []
     for cluster in clusters:
-        response_cluster = cluster.model_dump()
+        response_cluster = cluster.model_dump(exclude_none=True)
+        # These default setting is needed because last_use and status_updated_at
+        # used to be not optional.
+        # TODO(syang): remove this after v0.10.7 or v0.11.0
+        if 'last_use' not in response_cluster:
+            response_cluster['last_use'] = ''
+        if 'status_updated_at' not in response_cluster:
+            response_cluster['status_updated_at'] = 0
         response_cluster['status'] = cluster['status'].value
         handle = serialize_utils.prepare_handle_for_backwards_compatibility(
             cluster['handle'])
         response_cluster['handle'] = pickle_and_encode(handle)
+        # TODO (syang) We still need to return this field for backwards
+        # compatibility.
+        # Remove this field at or after v0.10.7 or v0.11.0
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
-            …
+            None)  # Always returns None.
         response.append(response_cluster)
     return response

@@ -206,10 +216,11 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
 @register_encoder('storage_ls')
 def encode_storage_ls(
         return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
-    for storage_info in return_value:
+    response_list = [storage_info.model_dump() for storage_info in return_value]
+    for storage_info in response_list:
         storage_info['status'] = storage_info['status'].value
         storage_info['store'] = [store.value for store in storage_info['store']]
-    return return_value
+    return response_list


 @register_encoder('volume_list')
@@ -219,11 +230,11 @@ def encode_volume_list(


 @register_encoder('job_status')
-def encode_job_status(return_value: Dict[int, Any]) -> Dict[…
+def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
     for job_id in return_value.keys():
         if return_value[job_id] is not None:
             return_value[job_id] = return_value[job_id].value
-    return return_value
+    return {str(k): v for k, v in return_value.items()}


 @register_encoder('kubernetes_node_info')
@@ -235,3 +246,19 @@ def encode_kubernetes_node_info(
 @register_encoder('endpoints')
 def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
     return {str(k): v for k, v in return_value.items()}
+
+
+@register_encoder('realtime_kubernetes_gpu_availability')
+def encode_realtime_gpu_availability(
+    return_value: List[Tuple[str,
+                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
+    # Convert RealtimeGpuAvailability namedtuples to lists
+    # for JSON serialization.
+    result = []
+    for context, gpu_list in return_value:
+        gpu_availability_list = []
+        for gpu in gpu_list:
+            gpu_list_item = [gpu.gpu, gpu.counts, gpu.capacity, gpu.available]
+            gpu_availability_list.append(gpu_list_item)
+        result.append((context, gpu_availability_list))
+    return result
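The encode_status change pairs `model_dump(exclude_none=True)` with explicit fallbacks, so older clients that still expect `last_use` and `status_updated_at` keep receiving values. A minimal sketch of that pattern (a hypothetical stand-in model, not the real StatusResponse; assumes pydantic v2):

    from typing import Optional
    import pydantic

    class ClusterSketch(pydantic.BaseModel):
        name: str
        last_use: Optional[str] = None
        status_updated_at: Optional[int] = None

    cluster = ClusterSketch(name='demo')

    # exclude_none drops unset optionals from the dict entirely...
    dumped = cluster.model_dump(exclude_none=True)
    assert 'last_use' not in dumped

    # ...so backwards-compatible defaults are re-inserted for old clients.
    dumped.setdefault('last_use', '')
    dumped.setdefault('status_updated_at', 0)
    print(dumped)  # {'name': 'demo', 'last_use': '', 'status_updated_at': 0}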
sky/server/server.py
CHANGED

@@ -25,6 +25,7 @@ import zipfile
 import aiofiles
 import anyio
 import fastapi
+from fastapi import responses as fastapi_responses
 from fastapi.middleware import cors
 import starlette.middleware.base
 import uvloop
@@ -1512,7 +1513,7 @@ async def get_expanded_request_id(request_id: str) -> str:


 # === API server related APIs ===
-@app.get('/api/get')
+@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
 async def api_get(request_id: str) -> payloads.RequestPayload:
     """Gets a request with a given request ID prefix."""
     # Validate request_id prefix matches a single request.
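`response_class=ORJSONResponse` swaps FastAPI's default JSON serializer for orjson on this one route (it requires orjson to be installed, which matches the new dependency added below). A minimal standalone example of the same pattern, with a hypothetical route name:

    import fastapi
    from fastapi import responses as fastapi_responses

    app = fastapi.FastAPI()

    # Only this route serializes with orjson; other routes keep the default
    # JSONResponse. Useful when a payload is large, e.g. a request record
    # carrying big encoded fields.
    @app.get('/items', response_class=fastapi_responses.ORJSONResponse)
    async def items() -> dict:
        return {'items': list(range(5))}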
sky/server/stream_utils.py
CHANGED

@@ -25,6 +25,8 @@ logger = sky_logging.init_logger(__name__)
 _BUFFER_SIZE = 8 * 1024  # 8KB
 _BUFFER_TIMEOUT = 0.02  # 20ms
 _HEARTBEAT_INTERVAL = 30
+_READ_CHUNK_SIZE = 256 * 1024  # 256KB chunks for file reading
+
 # If a SHORT request has been stuck in pending for
 # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
 _SHORT_REQUEST_SPINNER_TIMEOUT = 2
@@ -235,6 +237,9 @@ async def _tail_log_file(
     buffer_bytes = 0
     last_flush_time = asyncio.get_event_loop().time()

+    # Read file in chunks instead of line-by-line for better performance
+    incomplete_line = b''  # Buffer for incomplete lines across chunks
+
     async def flush_buffer() -> AsyncGenerator[str, None]:
         nonlocal buffer, buffer_bytes, last_flush_time
         if buffer:
@@ -255,8 +260,23 @@ async def _tail_log_file(
             async for chunk in flush_buffer():
                 yield chunk

-            …
-            …
+            # Read file in chunks for better I/O performance
+            file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
+            if not file_chunk:
+                # Process any remaining incomplete line
+                if incomplete_line:
+                    line_str = incomplete_line.decode('utf-8')
+                    if plain_logs:
+                        is_payload, line_str = message_utils.decode_payload(
+                            line_str, raise_for_mismatch=False)
+                        if not is_payload:
+                            buffer.append(line_str)
+                            buffer_bytes += len(line_str.encode('utf-8'))
+                    else:
+                        buffer.append(line_str)
+                        buffer_bytes += len(line_str.encode('utf-8'))
+                    incomplete_line = b''
+
                 # Avoid checking the status too frequently to avoid overloading the
                 # DB.
                 should_check_status = (current_time -
@@ -328,16 +348,39 @@ async def _tail_log_file(
                 # performance but it helps avoid unnecessary heartbeat strings
                 # being printed when the client runs in an old version.
                 last_heartbeat_time = asyncio.get_event_loop().time()
-            … (10 removed lines of the old line-by-line read loop; not captured in this view)
+
+            # Combine with any incomplete line from previous chunk
+            file_chunk = incomplete_line + file_chunk
+            incomplete_line = b''
+
+            # Split chunk into lines, preserving line structure
+            lines_bytes = file_chunk.split(b'\n')
+
+            # If chunk doesn't end with newline, the last element is incomplete
+            if file_chunk and not file_chunk.endswith(b'\n'):
+                incomplete_line = lines_bytes[-1]
+                lines_bytes = lines_bytes[:-1]
+            else:
+                # If ends with \n, split creates an empty last element we should
+                # ignore
+                if lines_bytes and lines_bytes[-1] == b'':
+                    lines_bytes = lines_bytes[:-1]
+
+            # Process all complete lines in this chunk
+            for line_bytes in lines_bytes:
+                # Reconstruct line with newline (since split removed it)
+                line_str = line_bytes.decode('utf-8') + '\n'
+
+                if plain_logs:
+                    is_payload, line_str = message_utils.decode_payload(
+                        line_str, raise_for_mismatch=False)
+                    # TODO(aylei): implement heartbeat mechanism for plain logs,
+                    # sending invisible characters might be okay.
+                    if is_payload:
+                        continue
+
+                buffer.append(line_str)
+                buffer_bytes += len(line_str.encode('utf-8'))

             # Flush remaining lines in the buffer.
             async for chunk in flush_buffer():
@@ -373,7 +416,7 @@ def stream_response(
     async def on_disconnect():
         logger.info(f'User terminated the connection for request '
                     f'{request_id}')
-        requests_lib.…
+        await requests_lib.kill_request_async(request_id)

     # The background task will be run after returning a response.
     # https://fastapi.tiangolo.com/tutorial/background-tasks/
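The core of the chunked tailing logic is the carry buffer for a line that straddles two reads. A self-contained sketch of just that splitting step (a pure function with a hypothetical name, no async I/O), mirroring the diff's handling of trailing newlines:

    from typing import List, Tuple

    def split_chunk(carry: bytes, chunk: bytes) -> Tuple[List[bytes], bytes]:
        """Split carry+chunk into complete lines plus a new carry.

        Lines are split on b'\n'; if the data does not end with a newline,
        the tail is carried over to the next read.
        """
        data = carry + chunk
        lines = data.split(b'\n')
        if data and not data.endswith(b'\n'):
            return lines[:-1], lines[-1]
        # A trailing newline leaves an empty last element to drop.
        if lines and lines[-1] == b'':
            lines = lines[:-1]
        return lines, b''

    lines, carry = split_chunk(b'', b'first\nseco')
    assert lines == [b'first'] and carry == b'seco'
    lines, carry = split_chunk(carry, b'nd\n')
    assert lines == [b'second'] and carry == b''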
sky/setup_files/dependencies.py
CHANGED

@@ -49,6 +49,7 @@ install_requires = [
     # <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
     'pyyaml > 3.13, != 5.4.*',
     'ijson',
+    'orjson',
     'requests',
     # SkyPilot inherits from uvicorn.Server to customize the behavior of
     # uvicorn, so we need to pin uvicorn version to avoid potential break
@@ -187,6 +188,7 @@ cloud_dependencies: Dict[str, List[str]] = {
     'docker': ['docker'] + local_ray,
     'lambda': [],  # No dependencies needed for lambda
     'cloudflare': aws_dependencies,
+    'coreweave': aws_dependencies,
     'scp': local_ray,
     'oci': ['oci'],
     # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333  # pylint: disable=line-too-long
sky/task.py
CHANGED

@@ -1552,6 +1552,16 @@ class Task:
                 self.update_file_mounts({
                     mnt_path: blob_path,
                 })
+            elif store_type is storage_lib.StoreType.COREWEAVE:
+                if storage.source is not None and not isinstance(
+                        storage.source,
+                        list) and storage.source.startswith('cw://'):
+                    blob_path = storage.source
+                else:
+                    blob_path = 'cw://' + storage.name
+                self.update_file_mounts({
+                    mnt_path: blob_path,
+                })
             else:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Storage Type {store_type} '
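The COREWEAVE branch follows the same pattern as the other store types: reuse the source when it is already a cw:// URI, otherwise derive one from the storage name. A sketch of that decision in isolation (hypothetical helper name; 'cw://' is the prefix shown in the diff):

    from typing import List, Optional, Union

    def coreweave_blob_path(source: Optional[Union[str, List[str]]],
                            name: str) -> str:
        # A single string source that is already a cw:// URI is used verbatim;
        # anything else (None, a local path, a list of paths) maps to the
        # bucket named after the storage object.
        if (source is not None and not isinstance(source, list) and
                source.startswith('cw://')):
            return source
        return 'cw://' + name

    assert coreweave_blob_path('cw://my-bucket/data', 'x') == 'cw://my-bucket/data'
    assert coreweave_blob_path('/local/dir', 'my-store') == 'cw://my-store'
    assert coreweave_blob_path(None, 'my-store') == 'cw://my-store'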
sky/templates/nebius-ray.yml.j2
CHANGED

@@ -156,6 +156,7 @@ setup_commands:
       echo '{{env_var}}={{env_value}}' | sudo tee -a /etc/environment;
       {%- endfor %}
       {%- endif %}
+      IP=$(hostname -I | awk '{print $1}'); echo "$IP $(hostname)" | sudo tee -a /etc/hosts;
       sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
       sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
       mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
sky/utils/cli_utils/status_utils.py
CHANGED

@@ -282,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
             if resources_str_full is not None:
                 resources_str = resources_str_full
         if resources_str is None:
-            …
-            …
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=truncate))
+            if truncate:
+                resources_str = resources_str_simple
+            else:
+                assert resources_str_full is not None
+                resources_str = resources_str_full

         return resources_str
     return '-'