skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250904__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/runpod.py +68 -0
- sky/backends/backend_utils.py +5 -3
- sky/backends/cloud_vm_ray_backend.py +7 -2
- sky/client/cli/command.py +38 -6
- sky/client/sdk.py +22 -1
- sky/clouds/kubernetes.py +1 -1
- sky/clouds/nebius.py +4 -2
- sky/clouds/runpod.py +17 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +1 -0
- sky/dashboard/out/_next/static/chunks/{7205-88191679e7988c57.js → 1836-37fede578e2da5f8.js} +4 -9
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +6 -0
- sky/dashboard/out/_next/static/chunks/{3785.d5b86f6ebc88e6e6.js → 3785.4872a2f3aa489880.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4783.c485f48348349f47.js → 5339.3fda4a4010ff4e06.js} +4 -9
- sky/dashboard/out/_next/static/chunks/{9946.3b7b43c217ff70ec.js → 649.b9d7f7d10c1b8c53.js} +4 -9
- sky/dashboard/out/_next/static/chunks/6856-66e696640347e77b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +6 -0
- sky/dashboard/out/_next/static/chunks/9037-1c0101b86582136f.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-39c9bd4cdb7e5a57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-a0527109c2fab467.js → [cluster]-0b4b35dc1dfe046c.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-81351f95f3bec08e.js → [context]-6563820e094f68ca.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-c320641c2bcbbea6.js → infra-aabba60d57826e0f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-de06e613e20bc977.js → [name]-af76bb06dbb3954f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-be35b22e2046564c.js → workspaces-7598c33a746cdc91.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-24c4fc6d30ce0193.js +1 -0
- sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +29 -38
- sky/global_user_state.py +16 -1
- sky/jobs/state.py +1 -1
- sky/models.py +1 -0
- sky/provision/kubernetes/instance.py +10 -3
- sky/provision/runpod/__init__.py +3 -0
- sky/provision/runpod/instance.py +17 -0
- sky/provision/runpod/utils.py +23 -5
- sky/provision/runpod/volume.py +158 -0
- sky/serve/serve_state.py +1 -1
- sky/server/config.py +31 -3
- sky/server/requests/executor.py +9 -3
- sky/server/requests/payloads.py +7 -1
- sky/server/requests/preconditions.py +8 -7
- sky/server/requests/requests.py +132 -57
- sky/server/server.py +48 -38
- sky/server/stream_utils.py +14 -6
- sky/server/uvicorn.py +11 -4
- sky/skylet/constants.py +1 -1
- sky/skypilot_config.py +21 -9
- sky/ssh_node_pools/server.py +5 -5
- sky/templates/kubernetes-ray.yml.j2 +5 -5
- sky/templates/runpod-ray.yml.j2 +8 -0
- sky/users/server.py +18 -15
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/command_runner.py +4 -0
- sky/utils/db/db_utils.py +58 -1
- sky/utils/db/migration_utils.py +0 -16
- sky/utils/resource_checker.py +6 -5
- sky/utils/schemas.py +1 -1
- sky/utils/volume.py +3 -0
- sky/volumes/client/sdk.py +28 -0
- sky/volumes/server/server.py +11 -1
- sky/volumes/utils.py +117 -68
- sky/volumes/volume.py +98 -39
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/METADATA +34 -34
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/RECORD +86 -84
- sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-8089ed1e0b7e37fd.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-049014c6d43d127b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-89a84fd7fa31362d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0eaa6f7e63f51311.js +0 -1
- /sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py
CHANGED
@@ -1,5 +1,6 @@
 """Utilities for REST API."""
 import asyncio
+import atexit
 import contextlib
 import dataclasses
 import enum
@@ -14,7 +15,7 @@ import threading
 import time
 import traceback
 from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
-                    Optional, Tuple)
+                    NamedTuple, Optional, Tuple)
 
 import colorama
 import filelock
@@ -300,10 +301,11 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
     prevent killing the caller request.
     """
     request_ids = [
-        request_task.request_id for request_task in get_request_tasks(
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
            cluster_names=[cluster_name],
            status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name])
+            exclude_request_names=[exclude_request_name]))
     ]
     kill_requests(request_ids)
 
@@ -323,11 +325,12 @@ def kill_requests(request_ids: Optional[List[str]] = None,
     """
     if request_ids is None:
         request_ids = [
-            request_task.request_id for request_task in get_request_tasks(
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
                 user_id=user_id,
                 status=[RequestStatus.RUNNING, RequestStatus.PENDING],
                 # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'])
+                exclude_request_names=['sky.api_cancel']))
         ]
     cancelled_request_ids = []
     for request_id in request_ids:
@@ -548,6 +551,40 @@ async def get_request_async(request_id: str) -> Optional[Request]:
     return await _get_request_no_lock_async(request_id)
 
 
+class StatusWithMsg(NamedTuple):
+    status: RequestStatus
+    status_msg: Optional[str] = None
+
+
+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_status_async(
+    request_id: str,
+    include_msg: bool = False,
+) -> Optional[StatusWithMsg]:
+    """Get the status of a request.
+
+    Args:
+        request_id: The ID of the request.
+        include_msg: Whether to include the status message.
+
+    Returns:
+        The status of the request. If the request is not found, returns
+        None.
+    """
+    assert _DB is not None
+    columns = 'status'
+    if include_msg:
+        columns += ', status_msg'
+    sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
+    async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
+        if rows is None or len(rows) == 0:
+            return None
+        status = RequestStatus(rows[0][0])
+        status_msg = rows[0][1] if include_msg else None
+        return StatusWithMsg(status, status_msg)
+
+
 @init_db
 @metrics_lib.time_me
 def create_if_not_exists(request: Request) -> bool:
@@ -570,17 +607,9 @@ async def create_if_not_exists_async(request: Request) -> bool:
     return True
 
 
-@init_db
-@metrics_lib.time_me
-def get_request_tasks(
-    status: Optional[List[RequestStatus]] = None,
-    cluster_names: Optional[List[str]] = None,
-    user_id: Optional[str] = None,
-    exclude_request_names: Optional[List[str]] = None,
-    include_request_names: Optional[List[str]] = None,
-    finished_before: Optional[float] = None,
-) -> List[Request]:
-    """Get a list of requests that match the given filters.
+@dataclasses.dataclass
+class RequestTaskFilter:
+    """Filter for requests.
 
     Args:
         status: a list of statuses of the requests to filter on.
@@ -598,51 +627,87 @@ def get_request_tasks(
         ValueError: If both exclude_request_names and include_request_names are
             provided.
     """
+    status: Optional[List[RequestStatus]] = None
+    cluster_names: Optional[List[str]] = None
+    user_id: Optional[str] = None
+    exclude_request_names: Optional[List[str]] = None
+    include_request_names: Optional[List[str]] = None
+    finished_before: Optional[float] = None
+
+    def __post_init__(self):
+        if (self.exclude_request_names is not None and
+                self.include_request_names is not None):
+            raise ValueError(
+                'Only one of exclude_request_names or include_request_names '
+                'can be provided, not both.')
+
+    def build_query(self) -> Tuple[str, List[Any]]:
+        """Build the SQL query and filter parameters.
+
+        Returns:
+            A tuple of (SQL, SQL parameters).
+        """
+        filters = []
+        filter_params: List[Any] = []
+        if self.status is not None:
+            status_list_str = ','.join(
+                repr(status.value) for status in self.status)
+            filters.append(f'status IN ({status_list_str})')
+        if self.exclude_request_names is not None:
+            exclude_request_names_str = ','.join(
+                repr(name) for name in self.exclude_request_names)
+            filters.append(f'name NOT IN ({exclude_request_names_str})')
+        if self.cluster_names is not None:
+            cluster_names_str = ','.join(
+                repr(name) for name in self.cluster_names)
+            filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
+        if self.user_id is not None:
+            filters.append(f'{COL_USER_ID} = ?')
+            filter_params.append(self.user_id)
+        if self.include_request_names is not None:
+            request_names_str = ','.join(
+                repr(name) for name in self.include_request_names)
+            filters.append(f'name IN ({request_names_str})')
+        if self.finished_before is not None:
+            filters.append('finished_at < ?')
+            filter_params.append(self.finished_before)
         filter_str = ' AND '.join(filters)
         if filter_str:
             filter_str = f' WHERE {filter_str}'
         columns_str = ', '.join(REQUEST_COLUMNS)
+        return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
+                'ORDER BY created_at DESC'), filter_params
+
+
+@init_db
+@metrics_lib.time_me
+def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
+    """Get a list of requests that match the given filters.
+
+    Args:
+        req_filter: the filter to apply to the requests. Refer to
+            RequestTaskFilter for the details.
+    """
+    assert _DB is not None
+    with _DB.conn:
+        cursor = _DB.conn.cursor()
+        cursor.execute(*req_filter.build_query())
         rows = cursor.fetchall()
         if rows is None:
             return []
+        return [Request.from_row(row) for row in rows]
+
+
+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_tasks_async(
+        req_filter: RequestTaskFilter) -> List[Request]:
+    """Async version of get_request_tasks."""
+    assert _DB is not None
+    async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
+        if not rows:
+            return []
+        return [Request.from_row(row) for row in rows]
 
 
 @init_db_async
@@ -739,8 +804,10 @@ def clean_finished_requests_with_retention(retention_seconds: int):
         retention_seconds: Requests older than this many seconds will be
             deleted.
     """
-    reqs = get_request_tasks(
+    reqs = get_request_tasks(
+        req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
+                                     finished_before=time.time() -
+                                     retention_seconds))
 
     subprocess_utils.run_in_parallel(
         func=lambda req: req.log_path.unlink(missing_ok=True),
@@ -767,7 +834,7 @@ async def requests_gc_daemon():
         try:
             # Negative value disables the requests GC
             if retention_seconds >= 0:
-                clean_finished_requests_with_retention(retention_seconds)
+                await clean_finished_requests_with_retention(retention_seconds)
         except asyncio.CancelledError:
             logger.info('Requests GC daemon cancelled')
             break
@@ -776,3 +843,11 @@
         # Run the daemon at most once every hour to avoid too frequent
         # cleanup.
         await asyncio.sleep(max(retention_seconds, 3600))
+
+
+def _cleanup():
+    if _DB is not None:
+        asyncio.run(_DB.close())
+
+
+atexit.register(_cleanup)
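
The core of this change replaces the loose keyword arguments of get_request_tasks with a single RequestTaskFilter dataclass that builds its own SQL, plus an async variant. A minimal usage sketch based only on names visible in the diff above (the import path is an assumption, not an official API reference):

    # Sketch: querying requests with the new filter API.
    from sky.server.requests import requests as requests_lib  # assumed path

    flt = requests_lib.RequestTaskFilter(
        status=[
            requests_lib.RequestStatus.PENDING,
            requests_lib.RequestStatus.RUNNING,
        ],
        cluster_names=['my-cluster'])

    # Synchronous callers (e.g. graceful shutdown in uvicorn.py):
    reqs = requests_lib.get_request_tasks(req_filter=flt)

    # Async callers (e.g. /api/status in server.py):
    #   reqs = await requests_lib.get_request_tasks_async(req_filter=flt)

    # Note: passing both include_request_names and exclude_request_names
    # raises ValueError in __post_init__.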
sky/server/server.py
CHANGED
@@ -71,6 +71,7 @@ from sky.utils import dag_utils
 from sky.utils import perf_utils
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.utils.db import db_utils
 from sky.volumes.server import server as volumes_rest
 from sky.workspaces import server as workspaces_rest
 
@@ -1321,10 +1322,11 @@ async def download(download_body: payloads.DownloadBody,
             detail=f'Error creating zip file: {str(e)}')
 
 
+# TODO(aylei): run it asynchronously after global_user_state support async op
 @app.post('/provision_logs')
-async def provision_logs(cluster_body: payloads.ClusterNameBody,
-                         follow: bool = True,
-                         tail: int = 0) -> fastapi.responses.StreamingResponse:
+def provision_logs(cluster_body: payloads.ClusterNameBody,
+                   follow: bool = True,
+                   tail: int = 0) -> fastapi.responses.StreamingResponse:
     """Streams the provision.log for the latest launch request of a cluster."""
     # Prefer clusters table first, then cluster_history as fallback.
     log_path_str = global_user_state.get_cluster_provision_log_path(
@@ -1429,27 +1431,29 @@ async def local_down(request: fastapi.Request) -> None:
 async def api_get(request_id: str) -> payloads.RequestPayload:
     """Gets a request with a given request ID prefix."""
     while True:
-        request_task = await requests_lib.get_request_async(request_id)
-        if request_task is None:
+        req_status = await requests_lib.get_request_status_async(request_id)
+        if req_status is None:
             print(f'No task with request ID {request_id}', flush=True)
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id!r} not found')
-        if request_task.status > requests_lib.RequestStatus.RUNNING:
-            if request_task.should_retry:
-                raise fastapi.HTTPException(
-                    status_code=503,
-                    detail=f'Request {request_id!r} should be retried')
-            request_error = request_task.get_error()
-            if request_error is not None:
-                raise fastapi.HTTPException(
-                    status_code=500, detail=request_task.encode().model_dump())
-            return request_task.encode()
-        elif (request_task.status == requests_lib.RequestStatus.RUNNING and
-              daemons.is_daemon_request_id(request_id)):
-            return request_task.encode()
+        if (req_status.status == requests_lib.RequestStatus.RUNNING and
+                daemons.is_daemon_request_id(request_id)):
+            # Daemon requests run forever, break without waiting for complete.
+            break
+        if req_status.status > requests_lib.RequestStatus.RUNNING:
+            break
         # yield control to allow other coroutines to run, sleep shortly
         # to avoid storming the DB and CPU in the meantime
         await asyncio.sleep(0.1)
+    request_task = await requests_lib.get_request_async(request_id)
+    if request_task.should_retry:
+        raise fastapi.HTTPException(
+            status_code=503, detail=f'Request {request_id!r} should be retried')
+    request_error = request_task.get_error()
+    if request_error is not None:
+        raise fastapi.HTTPException(status_code=500,
+                                    detail=request_task.encode().model_dump())
+    return request_task.encode()
 
 
 @app.get('/api/stream')
@@ -1606,10 +1610,9 @@ async def api_status(
                 requests_lib.RequestStatus.PENDING,
                 requests_lib.RequestStatus.RUNNING,
             ]
-        return [
-            request_task.readable_encode()
-            for request_task in requests_lib.get_request_tasks(status=statuses)
-        ]
+        request_tasks = await requests_lib.get_request_tasks_async(
+            req_filter=requests_lib.RequestTaskFilter(status=statuses))
+        return [r.readable_encode() for r in request_tasks]
     else:
         encoded_request_tasks = []
         for request_id in request_ids:
@@ -1808,17 +1811,20 @@ async def gpu_metrics() -> fastapi.Response:
 # === Internal APIs ===
 @app.get('/api/completion/cluster_name')
 async def complete_cluster_name(incomplete: str,) -> List[str]:
-    return global_user_state.get_cluster_names_start_with(incomplete)
+    return await context_utils.to_thread(
+        global_user_state.get_cluster_names_start_with, incomplete)
 
 
 @app.get('/api/completion/storage_name')
 async def complete_storage_name(incomplete: str,) -> List[str]:
-    return global_user_state.get_storage_names_start_with(incomplete)
+    return await context_utils.to_thread(
+        global_user_state.get_storage_names_start_with, incomplete)
 
 
 @app.get('/api/completion/volume_name')
 async def complete_volume_name(incomplete: str,) -> List[str]:
-    return global_user_state.get_volume_names_start_with(incomplete)
+    return await context_utils.to_thread(
+        global_user_state.get_volume_names_start_with, incomplete)
 
 
 @app.get('/api/completion/api_request')
@@ -1901,13 +1907,6 @@ if __name__ == '__main__':
 
     skyuvicorn.add_timestamp_prefix_for_server_logs()
 
-    # Initialize global user state db
-    global_user_state.initialize_and_get_db()
-    # Initialize request db
-    requests_lib.reset_db_and_logs()
-    # Restore the server user hash
-    _init_or_restore_server_user_hash()
-
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', default='127.0.0.1')
     parser.add_argument('--port', default=46580, type=int)
@@ -1923,7 +1922,17 @@
     # that it is shown only when the API server is started.
     usage_lib.maybe_show_privacy_policy()
 
-    config = server_config.compute_server_config(cmd_args.deploy)
+    # Initialize global user state db
+    db_utils.set_max_connections(1)
+    global_user_state.initialize_and_get_db()
+    # Initialize request db
+    requests_lib.reset_db_and_logs()
+    # Restore the server user hash
+    _init_or_restore_server_user_hash()
+    max_db_connections = global_user_state.get_max_db_connections()
+    config = server_config.compute_server_config(cmd_args.deploy,
+                                                 max_db_connections)
+
     num_workers = config.num_server_workers
 
     queue_server: Optional[multiprocessing.Process] = None
@@ -1948,11 +1957,12 @@
         logger.info(f'Starting SkyPilot API server, workers={num_workers}')
         # We don't support reload for now, since it may cause leakage of request
         # workers or interrupt running requests.
-        skyuvicorn.run(
+        uvicorn_config = uvicorn.Config('sky.server.server:app',
+                                        host=cmd_args.host,
+                                        port=cmd_args.port,
+                                        workers=num_workers)
+        skyuvicorn.run(uvicorn_config,
+                       max_db_connections=config.num_db_connections_per_worker)
     except Exception as exc:  # pylint: disable=broad-except
         logger.error(f'Failed to start SkyPilot API server: '
                      f'{common_utils.format_exception(exc, use_bracket=True)}')
sky/server/stream_utils.py
CHANGED
@@ -75,8 +75,10 @@ async def log_streamer(request_id: Optional[str],
         last_waiting_msg = ''
         waiting_msg = (f'Waiting for {request_task.name!r} request to be '
                        f'scheduled: {request_id}')
-        while request_task.status < requests_lib.RequestStatus.RUNNING:
-            if request_task.status_msg is not None:
+        req_status = request_task.status
+        req_msg = request_task.status_msg
+        while req_status < requests_lib.RequestStatus.RUNNING:
+            if req_msg is not None:
                 waiting_msg = request_task.status_msg
             if show_request_waiting_spinner:
                 yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
@@ -91,7 +93,10 @@ async def log_streamer(request_id: Optional[str],
             # polling the DB, which can be a bottleneck for high-concurrency
             # requests.
             await asyncio.sleep(0.1)
-            request_task = await requests_lib.get_request_async(request_id)
+            status_with_msg = await requests_lib.get_request_status_async(
+                request_id, include_msg=True)
+            req_status = status_with_msg.status
+            req_msg = status_with_msg.status_msg
             if not follow:
                 break
         if show_request_waiting_spinner:
@@ -153,10 +158,13 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
         line: Optional[bytes] = await f.readline()
         if not line:
             if request_id is not None:
-                request_task = await requests_lib.get_request_async(request_id)
-                if request_task.status > requests_lib.RequestStatus.RUNNING:
-                    if (request_task.status ==
+                req_status = await requests_lib.get_request_status_async(
+                    request_id)
+                if req_status.status > requests_lib.RequestStatus.RUNNING:
+                    if (req_status.status ==
                             requests_lib.RequestStatus.CANCELLED):
+                        request_task = await requests_lib.get_request_async(
+                            request_id)
                         if request_task.should_retry:
                             buffer.append(
                                 message_utils.encode_payload(
sky/server/uvicorn.py
CHANGED
@@ -26,6 +26,7 @@ from sky.utils import context_utils
 from sky.utils import env_options
 from sky.utils import perf_utils
 from sky.utils import subprocess_utils
+from sky.utils.db import db_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -88,9 +89,12 @@ class Server(uvicorn.Server):
     - Run the server process with contextually aware.
     """
 
-    def __init__(self, config: uvicorn.Config):
+    def __init__(self,
+                 config: uvicorn.Config,
+                 max_db_connections: Optional[int] = None):
         super().__init__(config=config)
         self.exiting: bool = False
+        self.max_db_connections = max_db_connections
 
     def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
         """Handle exit signal.
@@ -146,7 +150,8 @@
             requests_lib.RequestStatus.PENDING,
             requests_lib.RequestStatus.RUNNING,
         ]
-        reqs = requests_lib.get_request_tasks(status=statuses)
+        reqs = requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(status=statuses))
        if not reqs:
            break
        logger.info(f'{len(reqs)} on-going requests '
@@ -195,6 +200,8 @@
 
     def run(self, *args, **kwargs):
         """Run the server process."""
+        if self.max_db_connections is not None:
+            db_utils.set_max_connections(self.max_db_connections)
         add_timestamp_prefix_for_server_logs()
         context_utils.hijack_sys_attrs()
         # Use default loop policy of uvicorn (use uvloop if available).
@@ -209,14 +216,14 @@
         asyncio.run(self.serve(*args, **kwargs))
 
 
-def run(config: uvicorn.Config):
+def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
     """Run unvicorn server."""
     if config.reload:
         # Reload and multi-workers are mutually exclusive
         # in uvicorn. Since we do not use reload now, simply
         # guard by an exception.
         raise ValueError('Reload is not supported yet.')
-    server = Server(config=config)
+    server = Server(config=config, max_db_connections=max_db_connections)
     try:
         if config.workers is not None and config.workers > 1:
             sock = config.bind_socket()
sky/skylet/constants.py
CHANGED
@@ -362,7 +362,7 @@ SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
 
 RCLONE_CONFIG_DIR = '~/.config/rclone'
 RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf'
+RCLONE_MOUNT_CACHED_LOG_DIR = '~/.sky/rclone_log'
 RCLONE_CACHE_DIR = '~/.cache/rclone'
 RCLONE_CACHE_REFRESH_INTERVAL = 10
 
sky/skypilot_config.py
CHANGED
@@ -227,7 +227,7 @@ def _get_config_from_path(path: Optional[str]) -> config_utils.Config:
     return parse_and_validate_config_file(path)
 
 
-def _resolve_user_config_path() -> Optional[str]:
+def resolve_user_config_path() -> Optional[str]:
     # find the user config file path, None if not resolved.
     user_config_path = _get_config_file_path(ENV_VAR_GLOBAL_CONFIG)
     if user_config_path:
@@ -252,7 +252,7 @@ def _resolve_user_config_path() -> Optional[str]:
 
 def get_user_config() -> config_utils.Config:
     """Returns the user config."""
-    return _get_config_from_path(_resolve_user_config_path())
+    return _get_config_from_path(resolve_user_config_path())
 
 
 def _resolve_project_config_path() -> Optional[str]:
@@ -574,8 +574,13 @@ def _reload_config_as_server() -> None:
             'If db config is specified, no other config is allowed')
     logger.debug('retrieving config from database')
     with _DB_USE_LOCK:
-        sqlalchemy_engine = sqlalchemy.create_engine(db_url,
-                                                     poolclass=NullPool)
+        dispose_engine = False
+        if db_utils.get_max_connections() == 0:
+            dispose_engine = True
+            sqlalchemy_engine = sqlalchemy.create_engine(db_url,
+                                                         poolclass=NullPool)
+        else:
+            sqlalchemy_engine = db_utils.get_engine('config')
         db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
                                                  sqlalchemy_engine)
 
@@ -597,7 +602,8 @@ def _reload_config_as_server() -> None:
             server_config = overlay_skypilot_config(server_config,
                                                     db_config)
     # Close the engine to avoid connection leaks
-    sqlalchemy_engine.dispose()
+    if dispose_engine:
+        sqlalchemy_engine.dispose()
     if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
         logger.debug(f'server config: \n'
                      f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -611,7 +617,7 @@ def _reload_config_as_client() -> None:
     _set_loaded_config_path(None)
 
     overrides: List[config_utils.Config] = []
-    user_config_path = _resolve_user_config_path()
+    user_config_path = resolve_user_config_path()
     user_config = _get_config_from_path(user_config_path)
     if user_config:
         overrides.append(user_config)
@@ -867,8 +873,13 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
         raise ValueError('Cannot change db url while server is running')
     if existing_db_url:
         with _DB_USE_LOCK:
-            sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
-                                                         poolclass=NullPool)
+            dispose_engine = False
+            if db_utils.get_max_connections() == 0:
+                dispose_engine = True
+                sqlalchemy_engine = sqlalchemy.create_engine(
+                    existing_db_url, poolclass=NullPool)
+            else:
+                sqlalchemy_engine = db_utils.get_engine('config')
             db_utils.add_all_tables_to_db_sqlalchemy(
                 Base.metadata, sqlalchemy_engine)
 
@@ -897,7 +908,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
             _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
             db_updated = True
             # Close the engine to avoid connection leaks
-            sqlalchemy_engine.dispose()
+            if dispose_engine:
+                sqlalchemy_engine.dispose()
 
     if not db_updated:
         # save to the local file (PVC in Kubernetes, local file otherwise)
sky/ssh_node_pools/server.py
CHANGED
@@ -15,7 +15,7 @@ router = fastapi.APIRouter()
 
 
 @router.get('')
-async def get_ssh_node_pools() -> Dict[str, Any]:
+def get_ssh_node_pools() -> Dict[str, Any]:
     """Get all SSH Node Pool configurations."""
     try:
         return ssh_node_pools_core.get_all_pools()
@@ -27,7 +27,7 @@ async def get_ssh_node_pools() -> Dict[str, Any]:
 
 
 @router.post('')
-async def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
+def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
     """Update SSH Node Pool configurations."""
     try:
         ssh_node_pools_core.update_pools(pools_config)
@@ -39,7 +39,7 @@ async def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
 
 
 @router.delete('/{pool_name}')
-async def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
+def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
     """Delete a SSH Node Pool configuration."""
     try:
         if ssh_node_pools_core.delete_pool(pool_name):
@@ -83,7 +83,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
 
 
 @router.get('/keys')
-async def list_ssh_keys() -> List[str]:
+def list_ssh_keys() -> List[str]:
     """List available SSH keys."""
     try:
         return ssh_node_pools_core.list_ssh_keys()
@@ -200,7 +200,7 @@ async def down_ssh_node_pool_general(
 
 
 @router.get('/{pool_name}/status')
-async def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
+def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
     """Get the status of a specific SSH Node Pool."""
     try:
         # Call ssh_status to check the context