skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250904__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/runpod.py +68 -0
  3. sky/backends/backend_utils.py +5 -3
  4. sky/backends/cloud_vm_ray_backend.py +7 -2
  5. sky/client/cli/command.py +38 -6
  6. sky/client/sdk.py +22 -1
  7. sky/clouds/kubernetes.py +1 -1
  8. sky/clouds/nebius.py +4 -2
  9. sky/clouds/runpod.py +17 -0
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/{7205-88191679e7988c57.js → 1836-37fede578e2da5f8.js} +4 -9
  13. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/{3785.d5b86f6ebc88e6e6.js → 3785.4872a2f3aa489880.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/{4783.c485f48348349f47.js → 5339.3fda4a4010ff4e06.js} +4 -9
  17. sky/dashboard/out/_next/static/chunks/{9946.3b7b43c217ff70ec.js → 649.b9d7f7d10c1b8c53.js} +4 -9
  18. sky/dashboard/out/_next/static/chunks/6856-66e696640347e77b.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/9037-1c0101b86582136f.js +6 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-39c9bd4cdb7e5a57.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-a0527109c2fab467.js → [cluster]-0b4b35dc1dfe046c.js} +2 -7
  23. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-81351f95f3bec08e.js → [context]-6563820e094f68ca.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{infra-c320641c2bcbbea6.js → infra-aabba60d57826e0f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-de06e613e20bc977.js → [name]-af76bb06dbb3954f.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/{workspaces-be35b22e2046564c.js → workspaces-7598c33a746cdc91.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/webpack-24c4fc6d30ce0193.js +1 -0
  29. sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_buildManifest.js +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/mounting_utils.py +29 -38
  46. sky/global_user_state.py +16 -1
  47. sky/jobs/state.py +1 -1
  48. sky/models.py +1 -0
  49. sky/provision/kubernetes/instance.py +10 -3
  50. sky/provision/runpod/__init__.py +3 -0
  51. sky/provision/runpod/instance.py +17 -0
  52. sky/provision/runpod/utils.py +23 -5
  53. sky/provision/runpod/volume.py +158 -0
  54. sky/serve/serve_state.py +1 -1
  55. sky/server/config.py +31 -3
  56. sky/server/requests/executor.py +9 -3
  57. sky/server/requests/payloads.py +7 -1
  58. sky/server/requests/preconditions.py +8 -7
  59. sky/server/requests/requests.py +132 -57
  60. sky/server/server.py +48 -38
  61. sky/server/stream_utils.py +14 -6
  62. sky/server/uvicorn.py +11 -4
  63. sky/skylet/constants.py +1 -1
  64. sky/skypilot_config.py +21 -9
  65. sky/ssh_node_pools/server.py +5 -5
  66. sky/templates/kubernetes-ray.yml.j2 +5 -5
  67. sky/templates/runpod-ray.yml.j2 +8 -0
  68. sky/users/server.py +18 -15
  69. sky/utils/benchmark_utils.py +60 -0
  70. sky/utils/command_runner.py +4 -0
  71. sky/utils/db/db_utils.py +58 -1
  72. sky/utils/db/migration_utils.py +0 -16
  73. sky/utils/resource_checker.py +6 -5
  74. sky/utils/schemas.py +1 -1
  75. sky/utils/volume.py +3 -0
  76. sky/volumes/client/sdk.py +28 -0
  77. sky/volumes/server/server.py +11 -1
  78. sky/volumes/utils.py +117 -68
  79. sky/volumes/volume.py +98 -39
  80. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/METADATA +34 -34
  81. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/RECORD +86 -84
  82. sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/3015-8089ed1e0b7e37fd.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-049014c6d43d127b.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +0 -6
  86. sky/dashboard/out/_next/static/chunks/9037-89a84fd7fa31362d.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +0 -6
  88. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
  89. sky/dashboard/out/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js +0 -1
  90. sky/dashboard/out/_next/static/chunks/webpack-0eaa6f7e63f51311.js +0 -1
  91. sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_ssgManifest.js +0 -0
  92. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/top_level.txt +0 -0
sky/server/requests/requests.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Utilities for REST API."""
2
2
  import asyncio
3
+ import atexit
3
4
  import contextlib
4
5
  import dataclasses
5
6
  import enum
@@ -14,7 +15,7 @@ import threading
14
15
  import time
15
16
  import traceback
16
17
  from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
17
- Optional, Tuple)
18
+ NamedTuple, Optional, Tuple)
18
19
 
19
20
  import colorama
20
21
  import filelock
@@ -300,10 +301,11 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
300
301
  prevent killing the caller request.
301
302
  """
302
303
  request_ids = [
303
- request_task.request_id for request_task in get_request_tasks(
304
+ request_task.request_id
305
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
304
306
  cluster_names=[cluster_name],
305
307
  status=[RequestStatus.PENDING, RequestStatus.RUNNING],
306
- exclude_request_names=[exclude_request_name])
308
+ exclude_request_names=[exclude_request_name]))
307
309
  ]
308
310
  kill_requests(request_ids)
309
311
 
@@ -323,11 +325,12 @@ def kill_requests(request_ids: Optional[List[str]] = None,
323
325
  """
324
326
  if request_ids is None:
325
327
  request_ids = [
326
- request_task.request_id for request_task in get_request_tasks(
328
+ request_task.request_id
329
+ for request_task in get_request_tasks(req_filter=RequestTaskFilter(
327
330
  user_id=user_id,
328
331
  status=[RequestStatus.RUNNING, RequestStatus.PENDING],
329
332
  # Avoid cancelling the cancel request itself.
330
- exclude_request_names=['sky.api_cancel'])
333
+ exclude_request_names=['sky.api_cancel']))
331
334
  ]
332
335
  cancelled_request_ids = []
333
336
  for request_id in request_ids:
@@ -548,6 +551,40 @@ async def get_request_async(request_id: str) -> Optional[Request]:
548
551
  return await _get_request_no_lock_async(request_id)
549
552
 
550
553
 
554
+ class StatusWithMsg(NamedTuple):
555
+ status: RequestStatus
556
+ status_msg: Optional[str] = None
557
+
558
+
559
+ @init_db_async
560
+ @metrics_lib.time_me_async
561
+ async def get_request_status_async(
562
+ request_id: str,
563
+ include_msg: bool = False,
564
+ ) -> Optional[StatusWithMsg]:
565
+ """Get the status of a request.
566
+
567
+ Args:
568
+ request_id: The ID of the request.
569
+ include_msg: Whether to include the status message.
570
+
571
+ Returns:
572
+ The status of the request. If the request is not found, returns
573
+ None.
574
+ """
575
+ assert _DB is not None
576
+ columns = 'status'
577
+ if include_msg:
578
+ columns += ', status_msg'
579
+ sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
580
+ async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
581
+ if rows is None or len(rows) == 0:
582
+ return None
583
+ status = RequestStatus(rows[0][0])
584
+ status_msg = rows[0][1] if include_msg else None
585
+ return StatusWithMsg(status, status_msg)
586
+
587
+
551
588
  @init_db
552
589
  @metrics_lib.time_me
553
590
  def create_if_not_exists(request: Request) -> bool:
@@ -570,17 +607,9 @@ async def create_if_not_exists_async(request: Request) -> bool:
570
607
  return True
571
608
 
572
609
 
573
- @init_db
574
- @metrics_lib.time_me
575
- def get_request_tasks(
576
- status: Optional[List[RequestStatus]] = None,
577
- cluster_names: Optional[List[str]] = None,
578
- user_id: Optional[str] = None,
579
- exclude_request_names: Optional[List[str]] = None,
580
- include_request_names: Optional[List[str]] = None,
581
- finished_before: Optional[float] = None,
582
- ) -> List[Request]:
583
- """Get a list of requests that match the given filters.
610
+ @dataclasses.dataclass
611
+ class RequestTaskFilter:
612
+ """Filter for requests.
584
613
 
585
614
  Args:
586
615
  status: a list of statuses of the requests to filter on.
@@ -598,51 +627,87 @@ def get_request_tasks(
598
627
  ValueError: If both exclude_request_names and include_request_names are
599
628
  provided.
600
629
  """
601
- if exclude_request_names is not None and include_request_names is not None:
602
- raise ValueError(
603
- 'Only one of exclude_request_names or include_request_names can be '
604
- 'provided, not both.')
605
-
606
- filters = []
607
- filter_params: List[Any] = []
608
- if status is not None:
609
- status_list_str = ','.join(repr(status.value) for status in status)
610
- filters.append(f'status IN ({status_list_str})')
611
- if exclude_request_names is not None:
612
- exclude_request_names_str = ','.join(
613
- repr(name) for name in exclude_request_names)
614
- filters.append(f'name NOT IN ({exclude_request_names_str})')
615
- if cluster_names is not None:
616
- cluster_names_str = ','.join(repr(name) for name in cluster_names)
617
- filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
618
- if user_id is not None:
619
- filters.append(f'{COL_USER_ID} = ?')
620
- filter_params.append(user_id)
621
- if include_request_names is not None:
622
- request_names_str = ','.join(
623
- repr(name) for name in include_request_names)
624
- filters.append(f'name IN ({request_names_str})')
625
- if finished_before is not None:
626
- filters.append('finished_at < ?')
627
- filter_params.append(finished_before)
628
- assert _DB is not None
629
- with _DB.conn:
630
- cursor = _DB.conn.cursor()
630
+ status: Optional[List[RequestStatus]] = None
631
+ cluster_names: Optional[List[str]] = None
632
+ user_id: Optional[str] = None
633
+ exclude_request_names: Optional[List[str]] = None
634
+ include_request_names: Optional[List[str]] = None
635
+ finished_before: Optional[float] = None
636
+
637
+ def __post_init__(self):
638
+ if (self.exclude_request_names is not None and
639
+ self.include_request_names is not None):
640
+ raise ValueError(
641
+ 'Only one of exclude_request_names or include_request_names '
642
+ 'can be provided, not both.')
643
+
644
+ def build_query(self) -> Tuple[str, List[Any]]:
645
+ """Build the SQL query and filter parameters.
646
+
647
+ Returns:
648
+ A tuple of (SQL, SQL parameters).
649
+ """
650
+ filters = []
651
+ filter_params: List[Any] = []
652
+ if self.status is not None:
653
+ status_list_str = ','.join(
654
+ repr(status.value) for status in self.status)
655
+ filters.append(f'status IN ({status_list_str})')
656
+ if self.exclude_request_names is not None:
657
+ exclude_request_names_str = ','.join(
658
+ repr(name) for name in self.exclude_request_names)
659
+ filters.append(f'name NOT IN ({exclude_request_names_str})')
660
+ if self.cluster_names is not None:
661
+ cluster_names_str = ','.join(
662
+ repr(name) for name in self.cluster_names)
663
+ filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
664
+ if self.user_id is not None:
665
+ filters.append(f'{COL_USER_ID} = ?')
666
+ filter_params.append(self.user_id)
667
+ if self.include_request_names is not None:
668
+ request_names_str = ','.join(
669
+ repr(name) for name in self.include_request_names)
670
+ filters.append(f'name IN ({request_names_str})')
671
+ if self.finished_before is not None:
672
+ filters.append('finished_at < ?')
673
+ filter_params.append(self.finished_before)
631
674
  filter_str = ' AND '.join(filters)
632
675
  if filter_str:
633
676
  filter_str = f' WHERE {filter_str}'
634
677
  columns_str = ', '.join(REQUEST_COLUMNS)
635
- cursor.execute(
636
- f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
637
- 'ORDER BY created_at DESC', filter_params)
678
+ return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
679
+ 'ORDER BY created_at DESC'), filter_params
680
+
681
+
682
+ @init_db
683
+ @metrics_lib.time_me
684
+ def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
685
+ """Get a list of requests that match the given filters.
686
+
687
+ Args:
688
+ req_filter: the filter to apply to the requests. Refer to
689
+ RequestTaskFilter for the details.
690
+ """
691
+ assert _DB is not None
692
+ with _DB.conn:
693
+ cursor = _DB.conn.cursor()
694
+ cursor.execute(*req_filter.build_query())
638
695
  rows = cursor.fetchall()
639
696
  if rows is None:
640
697
  return []
641
- requests = []
642
- for row in rows:
643
- request = Request.from_row(row)
644
- requests.append(request)
645
- return requests
698
+ return [Request.from_row(row) for row in rows]
699
+
700
+
701
+ @init_db_async
702
+ @metrics_lib.time_me_async
703
+ async def get_request_tasks_async(
704
+ req_filter: RequestTaskFilter) -> List[Request]:
705
+ """Async version of get_request_tasks."""
706
+ assert _DB is not None
707
+ async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
708
+ if not rows:
709
+ return []
710
+ return [Request.from_row(row) for row in rows]
646
711
 
647
712
 
648
713
  @init_db_async
@@ -739,8 +804,10 @@ def clean_finished_requests_with_retention(retention_seconds: int):
739
804
  retention_seconds: Requests older than this many seconds will be
740
805
  deleted.
741
806
  """
742
- reqs = get_request_tasks(status=RequestStatus.finished_status(),
743
- finished_before=time.time() - retention_seconds)
807
+ reqs = get_request_tasks(
808
+ req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
809
+ finished_before=time.time() -
810
+ retention_seconds))
744
811
 
745
812
  subprocess_utils.run_in_parallel(
746
813
  func=lambda req: req.log_path.unlink(missing_ok=True),
@@ -767,7 +834,7 @@ async def requests_gc_daemon():
767
834
  try:
768
835
  # Negative value disables the requests GC
769
836
  if retention_seconds >= 0:
770
- clean_finished_requests_with_retention(retention_seconds)
837
+ await clean_finished_requests_with_retention(retention_seconds)
771
838
  except asyncio.CancelledError:
772
839
  logger.info('Requests GC daemon cancelled')
773
840
  break
@@ -776,3 +843,11 @@ async def requests_gc_daemon():
776
843
  # Run the daemon at most once every hour to avoid too frequent
777
844
  # cleanup.
778
845
  await asyncio.sleep(max(retention_seconds, 3600))
846
+
847
+
848
+ def _cleanup():
849
+ if _DB is not None:
850
+ asyncio.run(_DB.close())
851
+
852
+
853
+ atexit.register(_cleanup)
sky/server/server.py CHANGED
@@ -71,6 +71,7 @@ from sky.utils import dag_utils
71
71
  from sky.utils import perf_utils
72
72
  from sky.utils import status_lib
73
73
  from sky.utils import subprocess_utils
74
+ from sky.utils.db import db_utils
74
75
  from sky.volumes.server import server as volumes_rest
75
76
  from sky.workspaces import server as workspaces_rest
76
77
 
@@ -1321,10 +1322,11 @@ async def download(download_body: payloads.DownloadBody,
1321
1322
  detail=f'Error creating zip file: {str(e)}')
1322
1323
 
1323
1324
 
1325
+ # TODO(aylei): run it asynchronously after global_user_state support async op
1324
1326
  @app.post('/provision_logs')
1325
- async def provision_logs(cluster_body: payloads.ClusterNameBody,
1326
- follow: bool = True,
1327
- tail: int = 0) -> fastapi.responses.StreamingResponse:
1327
+ def provision_logs(cluster_body: payloads.ClusterNameBody,
1328
+ follow: bool = True,
1329
+ tail: int = 0) -> fastapi.responses.StreamingResponse:
1328
1330
  """Streams the provision.log for the latest launch request of a cluster."""
1329
1331
  # Prefer clusters table first, then cluster_history as fallback.
1330
1332
  log_path_str = global_user_state.get_cluster_provision_log_path(
@@ -1429,27 +1431,29 @@ async def local_down(request: fastapi.Request) -> None:
1429
1431
  async def api_get(request_id: str) -> payloads.RequestPayload:
1430
1432
  """Gets a request with a given request ID prefix."""
1431
1433
  while True:
1432
- request_task = await requests_lib.get_request_async(request_id)
1433
- if request_task is None:
1434
+ req_status = await requests_lib.get_request_status_async(request_id)
1435
+ if req_status is None:
1434
1436
  print(f'No task with request ID {request_id}', flush=True)
1435
1437
  raise fastapi.HTTPException(
1436
1438
  status_code=404, detail=f'Request {request_id!r} not found')
1437
- if request_task.status > requests_lib.RequestStatus.RUNNING:
1438
- if request_task.should_retry:
1439
- raise fastapi.HTTPException(
1440
- status_code=503,
1441
- detail=f'Request {request_id!r} should be retried')
1442
- request_error = request_task.get_error()
1443
- if request_error is not None:
1444
- raise fastapi.HTTPException(
1445
- status_code=500, detail=request_task.encode().model_dump())
1446
- return request_task.encode()
1447
- elif (request_task.status == requests_lib.RequestStatus.RUNNING and
1448
- daemons.is_daemon_request_id(request_id)):
1449
- return request_task.encode()
1439
+ if (req_status.status == requests_lib.RequestStatus.RUNNING and
1440
+ daemons.is_daemon_request_id(request_id)):
1441
+ # Daemon requests run forever, break without waiting for complete.
1442
+ break
1443
+ if req_status.status > requests_lib.RequestStatus.RUNNING:
1444
+ break
1450
1445
  # yield control to allow other coroutines to run, sleep shortly
1451
1446
  # to avoid storming the DB and CPU in the meantime
1452
1447
  await asyncio.sleep(0.1)
1448
+ request_task = await requests_lib.get_request_async(request_id)
1449
+ if request_task.should_retry:
1450
+ raise fastapi.HTTPException(
1451
+ status_code=503, detail=f'Request {request_id!r} should be retried')
1452
+ request_error = request_task.get_error()
1453
+ if request_error is not None:
1454
+ raise fastapi.HTTPException(status_code=500,
1455
+ detail=request_task.encode().model_dump())
1456
+ return request_task.encode()
1453
1457
 
1454
1458
 
1455
1459
  @app.get('/api/stream')
@@ -1606,10 +1610,9 @@ async def api_status(
1606
1610
  requests_lib.RequestStatus.PENDING,
1607
1611
  requests_lib.RequestStatus.RUNNING,
1608
1612
  ]
1609
- return [
1610
- request_task.readable_encode()
1611
- for request_task in requests_lib.get_request_tasks(status=statuses)
1612
- ]
1613
+ request_tasks = await requests_lib.get_request_tasks_async(
1614
+ req_filter=requests_lib.RequestTaskFilter(status=statuses))
1615
+ return [r.readable_encode() for r in request_tasks]
1613
1616
  else:
1614
1617
  encoded_request_tasks = []
1615
1618
  for request_id in request_ids:
@@ -1808,17 +1811,20 @@ async def gpu_metrics() -> fastapi.Response:
1808
1811
  # === Internal APIs ===
1809
1812
  @app.get('/api/completion/cluster_name')
1810
1813
  async def complete_cluster_name(incomplete: str,) -> List[str]:
1811
- return global_user_state.get_cluster_names_start_with(incomplete)
1814
+ return await context_utils.to_thread(
1815
+ global_user_state.get_cluster_names_start_with, incomplete)
1812
1816
 
1813
1817
 
1814
1818
  @app.get('/api/completion/storage_name')
1815
1819
  async def complete_storage_name(incomplete: str,) -> List[str]:
1816
- return global_user_state.get_storage_names_start_with(incomplete)
1820
+ return await context_utils.to_thread(
1821
+ global_user_state.get_storage_names_start_with, incomplete)
1817
1822
 
1818
1823
 
1819
1824
  @app.get('/api/completion/volume_name')
1820
1825
  async def complete_volume_name(incomplete: str,) -> List[str]:
1821
- return global_user_state.get_volume_names_start_with(incomplete)
1826
+ return await context_utils.to_thread(
1827
+ global_user_state.get_volume_names_start_with, incomplete)
1822
1828
 
1823
1829
 
1824
1830
  @app.get('/api/completion/api_request')
@@ -1901,13 +1907,6 @@ if __name__ == '__main__':
1901
1907
 
1902
1908
  skyuvicorn.add_timestamp_prefix_for_server_logs()
1903
1909
 
1904
- # Initialize global user state db
1905
- global_user_state.initialize_and_get_db()
1906
- # Initialize request db
1907
- requests_lib.reset_db_and_logs()
1908
- # Restore the server user hash
1909
- _init_or_restore_server_user_hash()
1910
-
1911
1910
  parser = argparse.ArgumentParser()
1912
1911
  parser.add_argument('--host', default='127.0.0.1')
1913
1912
  parser.add_argument('--port', default=46580, type=int)
@@ -1923,7 +1922,17 @@ if __name__ == '__main__':
1923
1922
  # that it is shown only when the API server is started.
1924
1923
  usage_lib.maybe_show_privacy_policy()
1925
1924
 
1926
- config = server_config.compute_server_config(cmd_args.deploy)
1925
+ # Initialize global user state db
1926
+ db_utils.set_max_connections(1)
1927
+ global_user_state.initialize_and_get_db()
1928
+ # Initialize request db
1929
+ requests_lib.reset_db_and_logs()
1930
+ # Restore the server user hash
1931
+ _init_or_restore_server_user_hash()
1932
+ max_db_connections = global_user_state.get_max_db_connections()
1933
+ config = server_config.compute_server_config(cmd_args.deploy,
1934
+ max_db_connections)
1935
+
1927
1936
  num_workers = config.num_server_workers
1928
1937
 
1929
1938
  queue_server: Optional[multiprocessing.Process] = None
@@ -1948,11 +1957,12 @@ if __name__ == '__main__':
1948
1957
  logger.info(f'Starting SkyPilot API server, workers={num_workers}')
1949
1958
  # We don't support reload for now, since it may cause leakage of request
1950
1959
  # workers or interrupt running requests.
1951
- config = uvicorn.Config('sky.server.server:app',
1952
- host=cmd_args.host,
1953
- port=cmd_args.port,
1954
- workers=num_workers)
1955
- skyuvicorn.run(config)
1960
+ uvicorn_config = uvicorn.Config('sky.server.server:app',
1961
+ host=cmd_args.host,
1962
+ port=cmd_args.port,
1963
+ workers=num_workers)
1964
+ skyuvicorn.run(uvicorn_config,
1965
+ max_db_connections=config.num_db_connections_per_worker)
1956
1966
  except Exception as exc: # pylint: disable=broad-except
1957
1967
  logger.error(f'Failed to start SkyPilot API server: '
1958
1968
  f'{common_utils.format_exception(exc, use_bracket=True)}')
sky/server/stream_utils.py CHANGED
@@ -75,8 +75,10 @@ async def log_streamer(request_id: Optional[str],
75
75
  last_waiting_msg = ''
76
76
  waiting_msg = (f'Waiting for {request_task.name!r} request to be '
77
77
  f'scheduled: {request_id}')
78
- while request_task.status < requests_lib.RequestStatus.RUNNING:
79
- if request_task.status_msg is not None:
78
+ req_status = request_task.status
79
+ req_msg = request_task.status_msg
80
+ while req_status < requests_lib.RequestStatus.RUNNING:
81
+ if req_msg is not None:
80
82
  waiting_msg = request_task.status_msg
81
83
  if show_request_waiting_spinner:
82
84
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
@@ -91,7 +93,10 @@ async def log_streamer(request_id: Optional[str],
91
93
  # polling the DB, which can be a bottleneck for high-concurrency
92
94
  # requests.
93
95
  await asyncio.sleep(0.1)
94
- request_task = await requests_lib.get_request_async(request_id)
96
+ status_with_msg = await requests_lib.get_request_status_async(
97
+ request_id, include_msg=True)
98
+ req_status = status_with_msg.status
99
+ req_msg = status_with_msg.status_msg
95
100
  if not follow:
96
101
  break
97
102
  if show_request_waiting_spinner:
@@ -153,10 +158,13 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
153
158
  line: Optional[bytes] = await f.readline()
154
159
  if not line:
155
160
  if request_id is not None:
156
- request_task = await requests_lib.get_request_async(request_id)
157
- if request_task.status > requests_lib.RequestStatus.RUNNING:
158
- if (request_task.status ==
161
+ req_status = await requests_lib.get_request_status_async(
162
+ request_id)
163
+ if req_status.status > requests_lib.RequestStatus.RUNNING:
164
+ if (req_status.status ==
159
165
  requests_lib.RequestStatus.CANCELLED):
166
+ request_task = await requests_lib.get_request_async(
167
+ request_id)
160
168
  if request_task.should_retry:
161
169
  buffer.append(
162
170
  message_utils.encode_payload(
sky/server/uvicorn.py CHANGED
@@ -26,6 +26,7 @@ from sky.utils import context_utils
26
26
  from sky.utils import env_options
27
27
  from sky.utils import perf_utils
28
28
  from sky.utils import subprocess_utils
29
+ from sky.utils.db import db_utils
29
30
 
30
31
  logger = sky_logging.init_logger(__name__)
31
32
 
@@ -88,9 +89,12 @@ class Server(uvicorn.Server):
88
89
  - Run the server process with contextually aware.
89
90
  """
90
91
 
91
- def __init__(self, config: uvicorn.Config):
92
+ def __init__(self,
93
+ config: uvicorn.Config,
94
+ max_db_connections: Optional[int] = None):
92
95
  super().__init__(config=config)
93
96
  self.exiting: bool = False
97
+ self.max_db_connections = max_db_connections
94
98
 
95
99
  def handle_exit(self, sig: int, frame: Union[FrameType, None]) -> None:
96
100
  """Handle exit signal.
@@ -146,7 +150,8 @@ class Server(uvicorn.Server):
146
150
  requests_lib.RequestStatus.PENDING,
147
151
  requests_lib.RequestStatus.RUNNING,
148
152
  ]
149
- reqs = requests_lib.get_request_tasks(status=statuses)
153
+ reqs = requests_lib.get_request_tasks(
154
+ req_filter=requests_lib.RequestTaskFilter(status=statuses))
150
155
  if not reqs:
151
156
  break
152
157
  logger.info(f'{len(reqs)} on-going requests '
@@ -195,6 +200,8 @@ class Server(uvicorn.Server):
195
200
 
196
201
  def run(self, *args, **kwargs):
197
202
  """Run the server process."""
203
+ if self.max_db_connections is not None:
204
+ db_utils.set_max_connections(self.max_db_connections)
198
205
  add_timestamp_prefix_for_server_logs()
199
206
  context_utils.hijack_sys_attrs()
200
207
  # Use default loop policy of uvicorn (use uvloop if available).
@@ -209,14 +216,14 @@ class Server(uvicorn.Server):
209
216
  asyncio.run(self.serve(*args, **kwargs))
210
217
 
211
218
 
212
- def run(config: uvicorn.Config):
219
+ def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
213
220
  """Run unvicorn server."""
214
221
  if config.reload:
215
222
  # Reload and multi-workers are mutually exclusive
216
223
  # in uvicorn. Since we do not use reload now, simply
217
224
  # guard by an exception.
218
225
  raise ValueError('Reload is not supported yet.')
219
- server = Server(config=config)
226
+ server = Server(config=config, max_db_connections=max_db_connections)
220
227
  try:
221
228
  if config.workers is not None and config.workers > 1:
222
229
  sock = config.bind_socket()
sky/skylet/constants.py CHANGED
@@ -362,7 +362,7 @@ SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
362
362
 
363
363
  RCLONE_CONFIG_DIR = '~/.config/rclone'
364
364
  RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf'
365
- RCLONE_LOG_DIR = '~/.sky/rclone_log'
365
+ RCLONE_MOUNT_CACHED_LOG_DIR = '~/.sky/rclone_log'
366
366
  RCLONE_CACHE_DIR = '~/.cache/rclone'
367
367
  RCLONE_CACHE_REFRESH_INTERVAL = 10
368
368
 
sky/skypilot_config.py CHANGED
@@ -227,7 +227,7 @@ def _get_config_from_path(path: Optional[str]) -> config_utils.Config:
227
227
  return parse_and_validate_config_file(path)
228
228
 
229
229
 
230
- def _resolve_user_config_path() -> Optional[str]:
230
+ def resolve_user_config_path() -> Optional[str]:
231
231
  # find the user config file path, None if not resolved.
232
232
  user_config_path = _get_config_file_path(ENV_VAR_GLOBAL_CONFIG)
233
233
  if user_config_path:
@@ -252,7 +252,7 @@ def _resolve_user_config_path() -> Optional[str]:
252
252
 
253
253
  def get_user_config() -> config_utils.Config:
254
254
  """Returns the user config."""
255
- return _get_config_from_path(_resolve_user_config_path())
255
+ return _get_config_from_path(resolve_user_config_path())
256
256
 
257
257
 
258
258
  def _resolve_project_config_path() -> Optional[str]:
@@ -574,8 +574,13 @@ def _reload_config_as_server() -> None:
574
574
  'If db config is specified, no other config is allowed')
575
575
  logger.debug('retrieving config from database')
576
576
  with _DB_USE_LOCK:
577
- sqlalchemy_engine = sqlalchemy.create_engine(db_url,
578
- poolclass=NullPool)
577
+ dispose_engine = False
578
+ if db_utils.get_max_connections() == 0:
579
+ dispose_engine = True
580
+ sqlalchemy_engine = sqlalchemy.create_engine(db_url,
581
+ poolclass=NullPool)
582
+ else:
583
+ sqlalchemy_engine = db_utils.get_engine('config')
579
584
  db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
580
585
  sqlalchemy_engine)
581
586
 
@@ -597,7 +602,8 @@ def _reload_config_as_server() -> None:
597
602
  server_config = overlay_skypilot_config(server_config,
598
603
  db_config)
599
604
  # Close the engine to avoid connection leaks
600
- sqlalchemy_engine.dispose()
605
+ if dispose_engine:
606
+ sqlalchemy_engine.dispose()
601
607
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
602
608
  logger.debug(f'server config: \n'
603
609
  f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -611,7 +617,7 @@ def _reload_config_as_client() -> None:
611
617
  _set_loaded_config_path(None)
612
618
 
613
619
  overrides: List[config_utils.Config] = []
614
- user_config_path = _resolve_user_config_path()
620
+ user_config_path = resolve_user_config_path()
615
621
  user_config = _get_config_from_path(user_config_path)
616
622
  if user_config:
617
623
  overrides.append(user_config)
@@ -867,8 +873,13 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
867
873
  raise ValueError('Cannot change db url while server is running')
868
874
  if existing_db_url:
869
875
  with _DB_USE_LOCK:
870
- sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
871
- poolclass=NullPool)
876
+ dispose_engine = False
877
+ if db_utils.get_max_connections() == 0:
878
+ dispose_engine = True
879
+ sqlalchemy_engine = sqlalchemy.create_engine(
880
+ existing_db_url, poolclass=NullPool)
881
+ else:
882
+ sqlalchemy_engine = db_utils.get_engine('config')
872
883
  db_utils.add_all_tables_to_db_sqlalchemy(
873
884
  Base.metadata, sqlalchemy_engine)
874
885
 
@@ -897,7 +908,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
897
908
  _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
898
909
  db_updated = True
899
910
  # Close the engine to avoid connection leaks
900
- sqlalchemy_engine.dispose()
911
+ if dispose_engine:
912
+ sqlalchemy_engine.dispose()
901
913
 
902
914
  if not db_updated:
903
915
  # save to the local file (PVC in Kubernetes, local file otherwise)
sky/ssh_node_pools/server.py CHANGED
@@ -15,7 +15,7 @@ router = fastapi.APIRouter()
15
15
 
16
16
 
17
17
  @router.get('')
18
- async def get_ssh_node_pools() -> Dict[str, Any]:
18
+ def get_ssh_node_pools() -> Dict[str, Any]:
19
19
  """Get all SSH Node Pool configurations."""
20
20
  try:
21
21
  return ssh_node_pools_core.get_all_pools()
@@ -27,7 +27,7 @@ async def get_ssh_node_pools() -> Dict[str, Any]:
27
27
 
28
28
 
29
29
  @router.post('')
30
- async def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
30
+ def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
31
31
  """Update SSH Node Pool configurations."""
32
32
  try:
33
33
  ssh_node_pools_core.update_pools(pools_config)
@@ -39,7 +39,7 @@ async def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
39
39
 
40
40
 
41
41
  @router.delete('/{pool_name}')
42
- async def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
42
+ def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
43
43
  """Delete a SSH Node Pool configuration."""
44
44
  try:
45
45
  if ssh_node_pools_core.delete_pool(pool_name):
@@ -83,7 +83,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
83
83
 
84
84
 
85
85
  @router.get('/keys')
86
- async def list_ssh_keys() -> List[str]:
86
+ def list_ssh_keys() -> List[str]:
87
87
  """List available SSH keys."""
88
88
  try:
89
89
  return ssh_node_pools_core.list_ssh_keys()
@@ -200,7 +200,7 @@ async def down_ssh_node_pool_general(
200
200
 
201
201
 
202
202
  @router.get('/{pool_name}/status')
203
- async def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
203
+ def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
204
204
  """Get the status of a specific SSH Node Pool."""
205
205
  try:
206
206
  # Call ssh_status to check the context