skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (93)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/client/cli/command.py +118 -30
  4. sky/client/cli/table_utils.py +14 -8
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  8. sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
  9. sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
  10. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/global_user_state.py +117 -17
  45. sky/jobs/client/sdk.py +28 -9
  46. sky/jobs/client/sdk_async.py +9 -3
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +7 -3
  49. sky/jobs/server/server.py +11 -11
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +281 -166
  52. sky/schemas/api/responses.py +2 -0
  53. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  54. sky/serve/server/server.py +7 -7
  55. sky/server/auth/oauth2_proxy.py +2 -5
  56. sky/server/common.py +1 -13
  57. sky/server/requests/executor.py +20 -20
  58. sky/server/requests/payloads.py +3 -0
  59. sky/server/requests/requests.py +51 -25
  60. sky/server/requests/serializers/decoders.py +23 -10
  61. sky/server/requests/serializers/encoders.py +5 -4
  62. sky/server/rest.py +35 -1
  63. sky/server/server.py +34 -34
  64. sky/setup_files/alembic.ini +4 -0
  65. sky/skylet/log_lib.py +8 -1
  66. sky/skylet/services.py +5 -5
  67. sky/skylet/subprocess_daemon.py +103 -29
  68. sky/skypilot_config.py +87 -75
  69. sky/ssh_node_pools/server.py +4 -4
  70. sky/users/permission.py +4 -0
  71. sky/utils/db/db_utils.py +32 -3
  72. sky/utils/db/migration_utils.py +7 -3
  73. sky/utils/subprocess_utils.py +13 -1
  74. sky/volumes/server/server.py +3 -3
  75. sky/workspaces/server.py +6 -6
  76. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
  77. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
  78. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  79. sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
  80. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  81. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  85. sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
  86. sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
  87. /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  89. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  90. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
  91. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
  93. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -17,7 +17,6 @@ import time
17
17
  import typing
18
18
  from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
19
19
  Tuple, TypeVar, Union)
20
- from urllib import parse
21
20
  import uuid
22
21
 
23
22
  import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
342
341
  @annotations.lru_cache(scope='global')
343
342
  def get_dashboard_url(server_url: str,
344
343
  starting_page: Optional[str] = None) -> str:
345
- # The server_url may include username or password with the
346
- # format of https://username:password@example.com:8080/path
347
- # We need to remove the username and password and only
348
- # return `https://example.com:8080/path`
349
- parsed = parse.urlparse(server_url)
350
- # Reconstruct the URL without credentials but keeping the scheme
351
- dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
352
- if parsed.port:
353
- dashboard_url = f'{dashboard_url}:{parsed.port}'
354
- if parsed.path:
355
- dashboard_url = f'{dashboard_url}{parsed.path}'
356
- dashboard_url = dashboard_url.rstrip('/')
344
+ dashboard_url = server_url.rstrip('/')
357
345
  dashboard_url = f'{dashboard_url}/dashboard'
358
346
  if starting_page:
359
347
  dashboard_url = f'{dashboard_url}/{starting_page}'
sky/server/requests/executor.py CHANGED
@@ -329,10 +329,7 @@ def override_request_env_and_config(
329
329
  # through the execution.
330
330
  user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
331
331
  name=request_body.env_vars[constants.USER_ENV_VAR])
332
- global_user_state.add_or_update_user(user)
333
- # Refetch the user to get the latest user info, including the created_at
334
- # field.
335
- user = global_user_state.get_user(user.id)
332
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
336
333
 
337
334
  # Force color to be enabled.
338
335
  os.environ['CLICOLOR_FORCE'] = '1'
@@ -689,7 +686,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
689
686
  ctx.cancel()
690
687
 
691
688
 
692
- def prepare_request(
689
+ async def prepare_request_async(
693
690
  request_id: str,
694
691
  request_name: str,
695
692
  request_body: payloads.RequestBody,
@@ -715,7 +712,7 @@ def prepare_request(
715
712
  user_id=user_id,
716
713
  cluster_name=request_cluster_name)
717
714
 
718
- if not api_requests.create_if_not_exists(request):
715
+ if not await api_requests.create_if_not_exists_async(request):
719
716
  raise exceptions.RequestAlreadyExistsError(
720
717
  f'Request {request_id} already exists.')
721
718
 
@@ -723,17 +720,18 @@ def prepare_request(
723
720
  return request
724
721
 
725
722
 
726
- def schedule_request(request_id: str,
727
- request_name: str,
728
- request_body: payloads.RequestBody,
729
- func: Callable[P, Any],
730
- request_cluster_name: Optional[str] = None,
731
- ignore_return_value: bool = False,
732
- schedule_type: api_requests.ScheduleType = (
733
- api_requests.ScheduleType.LONG),
734
- is_skypilot_system: bool = False,
735
- precondition: Optional[preconditions.Precondition] = None,
736
- retryable: bool = False) -> None:
723
+ async def schedule_request_async(request_id: str,
724
+ request_name: str,
725
+ request_body: payloads.RequestBody,
726
+ func: Callable[P, Any],
727
+ request_cluster_name: Optional[str] = None,
728
+ ignore_return_value: bool = False,
729
+ schedule_type: api_requests.ScheduleType = (
730
+ api_requests.ScheduleType.LONG),
731
+ is_skypilot_system: bool = False,
732
+ precondition: Optional[
733
+ preconditions.Precondition] = None,
734
+ retryable: bool = False) -> None:
737
735
  """Enqueue a request to the request queue.
738
736
 
739
737
  Args:
@@ -754,9 +752,11 @@ def schedule_request(request_id: str,
754
752
  The precondition is waited asynchronously and does not block the
755
753
  caller.
756
754
  """
757
- request_task = prepare_request(request_id, request_name, request_body, func,
758
- request_cluster_name, schedule_type,
759
- is_skypilot_system)
755
+ request_task = await prepare_request_async(request_id, request_name,
756
+ request_body, func,
757
+ request_cluster_name,
758
+ schedule_type,
759
+ is_skypilot_system)
760
760
  schedule_prepared_request(request_task, ignore_return_value, precondition,
761
761
  retryable)
762
762
 
sky/server/requests/payloads.py CHANGED
@@ -542,6 +542,9 @@ class JobsQueueV2Body(RequestBody):
542
542
  page: Optional[int] = None
543
543
  limit: Optional[int] = None
544
544
  statuses: Optional[List[str]] = None
545
+ # The fields to return in the response.
546
+ # Refer to the fields in the `class ManagedJobRecord` in `response.py`
547
+ fields: Optional[List[str]] = None
545
548
 
546
549
 
547
550
  class JobsCancelBody(RequestBody):
sky/server/requests/requests.py CHANGED
@@ -16,6 +16,7 @@ import time
16
16
  import traceback
17
17
  from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
18
18
  Tuple)
19
+ import uuid
19
20
 
20
21
  import anyio
21
22
  import colorama
@@ -293,6 +294,11 @@ class Request:
293
294
  raise
294
295
 
295
296
 
297
+ def get_new_request_id() -> str:
298
+ """Get a new request ID."""
299
+ return str(uuid.uuid4())
300
+
301
+
296
302
  def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
297
303
  """Serialize the SkyPilot API request for display purposes.
298
304
 
@@ -572,6 +578,26 @@ def reset_db_and_logs():
572
578
  f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
573
579
  shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
574
580
  ignore_errors=True)
581
+ with _init_db_lock:
582
+ _init_db_within_lock()
583
+ assert _DB is not None
584
+ with _DB.conn:
585
+ cursor = _DB.conn.cursor()
586
+ cursor.execute('SELECT sqlite_version()')
587
+ row = cursor.fetchone()
588
+ if row is None:
589
+ raise RuntimeError('Failed to get SQLite version')
590
+ version_str = row[0]
591
+ version_parts = version_str.split('.')
592
+ assert len(version_parts) >= 2, \
593
+ f'Invalid version string: {version_str}'
594
+ major, minor = int(version_parts[0]), int(version_parts[1])
595
+ # SQLite 3.35.0+ supports RETURNING statements.
596
+ # 3.35.0 was released in March 2021.
597
+ if not ((major > 3) or (major == 3 and minor >= 35)):
598
+ raise RuntimeError(
599
+ f'SQLite version {version_str} is not supported. '
600
+ 'Please upgrade to SQLite 3.35.0 or later.')
575
601
 
576
602
 
577
603
  def request_lock_path(request_id: str) -> str:
@@ -657,17 +683,15 @@ async def _get_request_no_lock_async(
657
683
  return Request.from_row(row)
658
684
 
659
685
 
660
- @init_db
686
+ @init_db_async
661
687
  @metrics_lib.time_me
662
- def get_latest_request_id() -> Optional[str]:
688
+ async def get_latest_request_id_async() -> Optional[str]:
663
689
  """Get the latest request ID."""
664
690
  assert _DB is not None
665
- with _DB.conn:
666
- cursor = _DB.conn.cursor()
667
- cursor.execute(f'SELECT request_id FROM {REQUEST_TABLE} '
668
- 'ORDER BY created_at DESC LIMIT 1')
669
- row = cursor.fetchone()
670
- return row[0] if row else None
691
+ async with _DB.execute_fetchall_async(
692
+ (f'SELECT request_id FROM {REQUEST_TABLE} '
693
+ 'ORDER BY created_at DESC LIMIT 1')) as rows:
694
+ return rows[0][0] if rows else None
671
695
 
672
696
 
673
697
  @init_db
@@ -725,27 +749,29 @@ async def get_request_status_async(
725
749
  return StatusWithMsg(status, status_msg)
726
750
 
727
751
 
728
- @init_db
729
- @metrics_lib.time_me
730
- def create_if_not_exists(request: Request) -> bool:
731
- """Create a SkyPilot API request if it does not exist."""
732
- with filelock.FileLock(request_lock_path(request.request_id)):
733
- if _get_request_no_lock(request.request_id) is not None:
734
- return False
735
- _add_or_update_request_no_lock(request)
736
- return True
737
-
738
-
739
752
  @init_db_async
740
753
  @metrics_lib.time_me_async
741
754
  @asyncio_utils.shield
742
755
  async def create_if_not_exists_async(request: Request) -> bool:
743
- """Async version of create_if_not_exists."""
744
- async with filelock.AsyncFileLock(request_lock_path(request.request_id)):
745
- if await _get_request_no_lock_async(request.request_id) is not None:
746
- return False
747
- await _add_or_update_request_no_lock_async(request)
748
- return True
756
+ """Create a request if it does not exist, otherwise do nothing.
757
+
758
+ Returns:
759
+ True if a new request is created, False if the request already exists.
760
+ """
761
+ assert _DB is not None
762
+ request_columns = ', '.join(REQUEST_COLUMNS)
763
+ values_str = ', '.join(['?'] * len(REQUEST_COLUMNS))
764
+ sql_statement = (
765
+ f'INSERT INTO {REQUEST_TABLE} '
766
+ f'({request_columns}) VALUES '
767
+ f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
768
+ request_row = request.to_row()
769
+ # Execute the SQL statement without getting the request lock.
770
+ # The request lock is used to prevent racing with cancellation codepath,
771
+ # but a request cannot be cancelled before it is created.
772
+ row = await _DB.execute_get_returning_value_async(sql_statement,
773
+ request_row)
774
+ return True if row else False
749
775
 
750
776
 
751
777
  @dataclasses.dataclass
sky/server/requests/serializers/decoders.py CHANGED
@@ -2,7 +2,7 @@
2
2
  import base64
3
3
  import pickle
4
4
  import typing
5
- from typing import Any, Dict, List, Optional, Tuple
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from sky import jobs as managed_jobs
8
8
  from sky import models
@@ -116,22 +116,35 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
116
116
 
117
117
 
118
118
  @register_decoders('jobs.queue_v2')
119
- def decode_jobs_queue_v2(return_value) -> List[responses.ManagedJobRecord]:
119
+ def decode_jobs_queue_v2(
120
+ return_value
121
+ ) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
122
+ List[responses.ManagedJobRecord]]:
120
123
  """Decode jobs queue response.
121
124
 
122
- Supports legacy list, or a dict {jobs, total}.
123
- - Returns list[job]
125
+ Supports legacy list, or a dict {jobs, total, total_no_filter,
126
+ status_counts}.
127
+
128
+ - Returns either list[job] or tuple(list[job], total, status_counts,
129
+ total_no_filter)
124
130
  """
125
- # Case 1: dict shape {jobs, total}
126
- if isinstance(return_value, dict) and 'jobs' in return_value:
131
+ # Case 1: dict shape {jobs, total, total_no_filter, status_counts}
132
+ if isinstance(return_value, dict):
127
133
  jobs = return_value.get('jobs', [])
134
+ total = return_value.get('total', len(jobs))
135
+ total_no_filter = return_value.get('total_no_filter', total)
136
+ status_counts = return_value.get('status_counts', {})
137
+ for job in jobs:
138
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
139
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
140
+ return jobs, total, status_counts, total_no_filter
128
141
  else:
129
142
  # Case 2: legacy list
130
143
  jobs = return_value
131
- for job in jobs:
132
- job['status'] = managed_jobs.ManagedJobStatus(job['status'])
133
- jobs = [responses.ManagedJobRecord(**job) for job in jobs]
134
- return jobs
144
+ for job in jobs:
145
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
146
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
147
+ return jobs
135
148
 
136
149
 
137
150
  def _decode_serve_status(
sky/server/requests/serializers/encoders.py CHANGED
@@ -121,7 +121,7 @@ def encode_status_kubernetes(
121
121
  encoded_cluster = dataclasses.asdict(cluster)
122
122
  encoded_cluster['status'] = encoded_cluster['status'].value
123
123
  encoded_unmanaged_clusters.append(encoded_cluster)
124
- all_jobs = [job.model_dump() for job in all_jobs]
124
+ all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
125
125
  return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
126
126
 
127
127
 
@@ -148,12 +148,13 @@ def encode_jobs_queue_v2(
148
148
  else:
149
149
  jobs = jobs_or_tuple
150
150
  total = None
151
- for job in jobs:
151
+ jobs_dict = [job.model_dump(by_alias=True) for job in jobs]
152
+ for job in jobs_dict:
152
153
  job['status'] = job['status'].value
153
154
  if total is None:
154
- return [job.model_dump() for job in jobs]
155
+ return jobs_dict
155
156
  return {
156
- 'jobs': [job.model_dump() for job in jobs],
157
+ 'jobs': jobs_dict,
157
158
  'total': total,
158
159
  'total_no_filter': total_no_filter,
159
160
  'status_counts': status_counts
sky/server/rest.py CHANGED
@@ -256,6 +256,40 @@ def handle_server_unavailable(response: 'requests.Response') -> None:
256
256
  raise exceptions.ServerTemporarilyUnavailableError(error_msg)
257
257
 
258
258
 
259
+ async def handle_server_unavailable_async(
260
+ response: 'aiohttp.ClientResponse') -> None:
261
+ """Async version: Handle 503 (Service Unavailable) error
262
+
263
+ The client get 503 error in the following cases:
264
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
265
+ request, e.g. when there is and rolling-update.
266
+ 2. The skypilot API server has temporary resource issue, e.g. when the
267
+ cucurrency of the handling process is exhausted.
268
+
269
+ We expect the caller (CLI or SDK) retry on these cases and show clear wait
270
+ message to the user to let user decide whether keep waiting or abort the
271
+ request.
272
+ """
273
+ if response.status != 503:
274
+ return
275
+
276
+ error_msg = ''
277
+ try:
278
+ response_data = await response.json()
279
+ if 'detail' in response_data:
280
+ error_msg = response_data['detail']
281
+ except Exception: # pylint: disable=broad-except
282
+ try:
283
+ text = await response.text()
284
+ if text:
285
+ error_msg = text
286
+ except Exception: # pylint: disable=broad-except
287
+ pass
288
+
289
+ with ux_utils.print_exception_no_traceback():
290
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
291
+
292
+
259
293
  @_retry_on_server_unavailable()
260
294
  def request(method, url, **kwargs) -> 'requests.Response':
261
295
  """Send a request to the API server, retry on server temporarily
@@ -332,7 +366,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
332
366
  response = await session.request(method, url, **kwargs)
333
367
 
334
368
  # Handle server unavailability (503 status) - same as sync version
335
- handle_server_unavailable(response)
369
+ await handle_server_unavailable_async(response)
336
370
 
337
371
  # Set remote API version and version from headers - same as sync version
338
372
  remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
sky/server/server.py CHANGED
@@ -163,7 +163,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
163
163
  """Middleware to add a request ID to each request."""
164
164
 
165
165
  async def dispatch(self, request: fastapi.Request, call_next):
166
- request_id = str(uuid.uuid4())
166
+ request_id = requests_lib.get_new_request_id()
167
167
  request.state.request_id = request_id
168
168
  response = await call_next(request)
169
169
  # TODO(syang): remove X-Request-ID when v0.10.0 is released.
@@ -455,9 +455,9 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
455
455
  loop.call_at(target, tick)
456
456
 
457
457
 
458
- def schedule_on_boot_check():
458
+ async def schedule_on_boot_check_async():
459
459
  try:
460
- executor.schedule_request(
460
+ await executor.schedule_request_async(
461
461
  request_id='skypilot-server-on-boot-check',
462
462
  request_name='check',
463
463
  request_body=payloads.CheckBody(),
@@ -480,7 +480,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
480
480
  if event.should_skip():
481
481
  continue
482
482
  try:
483
- executor.schedule_request(
483
+ await executor.schedule_request_async(
484
484
  request_id=event.id,
485
485
  request_name=event.name,
486
486
  request_body=payloads.RequestBody(),
@@ -495,7 +495,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
495
495
  # Lifespan will be executed in each uvicorn worker process, we
496
496
  # can safely ignore the error if the task is already scheduled.
497
497
  logger.debug(f'Request {event.id} already exists.')
498
- schedule_on_boot_check()
498
+ await schedule_on_boot_check_async()
499
499
  asyncio.create_task(cleanup_upload_ids())
500
500
  if metrics_utils.METRICS_ENABLED:
501
501
  # Start monitoring the event loop lag in each server worker
@@ -729,7 +729,7 @@ async def token(request: fastapi.Request,
729
729
  async def check(request: fastapi.Request,
730
730
  check_body: payloads.CheckBody) -> None:
731
731
  """Checks enabled clouds."""
732
- executor.schedule_request(
732
+ await executor.schedule_request_async(
733
733
  request_id=request.state.request_id,
734
734
  request_name='check',
735
735
  request_body=check_body,
@@ -743,7 +743,7 @@ async def enabled_clouds(request: fastapi.Request,
743
743
  workspace: Optional[str] = None,
744
744
  expand: bool = False) -> None:
745
745
  """Gets enabled clouds on the server."""
746
- executor.schedule_request(
746
+ await executor.schedule_request_async(
747
747
  request_id=request.state.request_id,
748
748
  request_name='enabled_clouds',
749
749
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
@@ -759,7 +759,7 @@ async def realtime_kubernetes_gpu_availability(
759
759
  realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
760
760
  ) -> None:
761
761
  """Gets real-time Kubernetes GPU availability."""
762
- executor.schedule_request(
762
+ await executor.schedule_request_async(
763
763
  request_id=request.state.request_id,
764
764
  request_name='realtime_kubernetes_gpu_availability',
765
765
  request_body=realtime_gpu_availability_body,
@@ -774,7 +774,7 @@ async def kubernetes_node_info(
774
774
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
775
775
  ) -> None:
776
776
  """Gets Kubernetes nodes information and hints."""
777
- executor.schedule_request(
777
+ await executor.schedule_request_async(
778
778
  request_id=request.state.request_id,
779
779
  request_name='kubernetes_node_info',
780
780
  request_body=kubernetes_node_info_body,
@@ -786,7 +786,7 @@ async def kubernetes_node_info(
786
786
  @app.get('/status_kubernetes')
787
787
  async def status_kubernetes(request: fastapi.Request) -> None:
788
788
  """Gets Kubernetes status."""
789
- executor.schedule_request(
789
+ await executor.schedule_request_async(
790
790
  request_id=request.state.request_id,
791
791
  request_name='status_kubernetes',
792
792
  request_body=payloads.RequestBody(),
@@ -800,7 +800,7 @@ async def list_accelerators(
800
800
  request: fastapi.Request,
801
801
  list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
802
802
  """Gets list of accelerators from cloud catalog."""
803
- executor.schedule_request(
803
+ await executor.schedule_request_async(
804
804
  request_id=request.state.request_id,
805
805
  request_name='list_accelerators',
806
806
  request_body=list_accelerator_counts_body,
@@ -815,7 +815,7 @@ async def list_accelerator_counts(
815
815
  list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
816
816
  ) -> None:
817
817
  """Gets list of accelerator counts from cloud catalog."""
818
- executor.schedule_request(
818
+ await executor.schedule_request_async(
819
819
  request_id=request.state.request_id,
820
820
  request_name='list_accelerator_counts',
821
821
  request_body=list_accelerator_counts_body,
@@ -872,7 +872,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
872
872
  async def optimize(optimize_body: payloads.OptimizeBody,
873
873
  request: fastapi.Request) -> None:
874
874
  """Optimizes the user's DAG."""
875
- executor.schedule_request(
875
+ await executor.schedule_request_async(
876
876
  request_id=request.state.request_id,
877
877
  request_name='optimize',
878
878
  request_body=optimize_body,
@@ -1082,7 +1082,7 @@ async def launch(launch_body: payloads.LaunchBody,
1082
1082
  """Launches a cluster or task."""
1083
1083
  request_id = request.state.request_id
1084
1084
  logger.info(f'Launching request: {request_id}')
1085
- executor.schedule_request(
1085
+ await executor.schedule_request_async(
1086
1086
  request_id,
1087
1087
  request_name='launch',
1088
1088
  request_body=launch_body,
@@ -1098,7 +1098,7 @@ async def launch(launch_body: payloads.LaunchBody,
1098
1098
  async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1099
1099
  """Executes a task on an existing cluster."""
1100
1100
  cluster_name = exec_body.cluster_name
1101
- executor.schedule_request(
1101
+ await executor.schedule_request_async(
1102
1102
  request_id=request.state.request_id,
1103
1103
  request_name='exec',
1104
1104
  request_body=exec_body,
@@ -1116,7 +1116,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1116
1116
  async def stop(request: fastapi.Request,
1117
1117
  stop_body: payloads.StopOrDownBody) -> None:
1118
1118
  """Stops a cluster."""
1119
- executor.schedule_request(
1119
+ await executor.schedule_request_async(
1120
1120
  request_id=request.state.request_id,
1121
1121
  request_name='stop',
1122
1122
  request_body=stop_body,
@@ -1136,7 +1136,7 @@ async def status(
1136
1136
  raise fastapi.HTTPException(
1137
1137
  status_code=503,
1138
1138
  detail='Server is shutting down, please try again later.')
1139
- executor.schedule_request(
1139
+ await executor.schedule_request_async(
1140
1140
  request_id=request.state.request_id,
1141
1141
  request_name='status',
1142
1142
  request_body=status_body,
@@ -1151,7 +1151,7 @@ async def status(
1151
1151
  async def endpoints(request: fastapi.Request,
1152
1152
  endpoint_body: payloads.EndpointsBody) -> None:
1153
1153
  """Gets the endpoint for a given cluster and port number (endpoint)."""
1154
- executor.schedule_request(
1154
+ await executor.schedule_request_async(
1155
1155
  request_id=request.state.request_id,
1156
1156
  request_name='endpoints',
1157
1157
  request_body=endpoint_body,
@@ -1165,7 +1165,7 @@ async def endpoints(request: fastapi.Request,
1165
1165
  async def down(request: fastapi.Request,
1166
1166
  down_body: payloads.StopOrDownBody) -> None:
1167
1167
  """Tears down a cluster."""
1168
- executor.schedule_request(
1168
+ await executor.schedule_request_async(
1169
1169
  request_id=request.state.request_id,
1170
1170
  request_name='down',
1171
1171
  request_body=down_body,
@@ -1179,7 +1179,7 @@ async def down(request: fastapi.Request,
1179
1179
  async def start(request: fastapi.Request,
1180
1180
  start_body: payloads.StartBody) -> None:
1181
1181
  """Restarts a cluster."""
1182
- executor.schedule_request(
1182
+ await executor.schedule_request_async(
1183
1183
  request_id=request.state.request_id,
1184
1184
  request_name='start',
1185
1185
  request_body=start_body,
@@ -1193,7 +1193,7 @@ async def start(request: fastapi.Request,
1193
1193
  async def autostop(request: fastapi.Request,
1194
1194
  autostop_body: payloads.AutostopBody) -> None:
1195
1195
  """Schedules an autostop/autodown for a cluster."""
1196
- executor.schedule_request(
1196
+ await executor.schedule_request_async(
1197
1197
  request_id=request.state.request_id,
1198
1198
  request_name='autostop',
1199
1199
  request_body=autostop_body,
@@ -1207,7 +1207,7 @@ async def autostop(request: fastapi.Request,
1207
1207
  async def queue(request: fastapi.Request,
1208
1208
  queue_body: payloads.QueueBody) -> None:
1209
1209
  """Gets the job queue of a cluster."""
1210
- executor.schedule_request(
1210
+ await executor.schedule_request_async(
1211
1211
  request_id=request.state.request_id,
1212
1212
  request_name='queue',
1213
1213
  request_body=queue_body,
@@ -1221,7 +1221,7 @@ async def queue(request: fastapi.Request,
1221
1221
  async def job_status(request: fastapi.Request,
1222
1222
  job_status_body: payloads.JobStatusBody) -> None:
1223
1223
  """Gets the status of a job."""
1224
- executor.schedule_request(
1224
+ await executor.schedule_request_async(
1225
1225
  request_id=request.state.request_id,
1226
1226
  request_name='job_status',
1227
1227
  request_body=job_status_body,
@@ -1235,7 +1235,7 @@ async def job_status(request: fastapi.Request,
1235
1235
  async def cancel(request: fastapi.Request,
1236
1236
  cancel_body: payloads.CancelBody) -> None:
1237
1237
  """Cancels jobs on a cluster."""
1238
- executor.schedule_request(
1238
+ await executor.schedule_request_async(
1239
1239
  request_id=request.state.request_id,
1240
1240
  request_name='cancel',
1241
1241
  request_body=cancel_body,
@@ -1255,7 +1255,7 @@ async def logs(
1255
1255
  # launch, to finish, so that a user does not need to manually pull the
1256
1256
  # request status.
1257
1257
  executor.check_request_thread_executor_available()
1258
- request_task = executor.prepare_request(
1258
+ request_task = await executor.prepare_request_async(
1259
1259
  request_id=request.state.request_id,
1260
1260
  request_name='logs',
1261
1261
  request_body=cluster_job_body,
@@ -1286,7 +1286,7 @@ async def download_logs(
1286
1286
  # We should reuse the original request body, so that the env vars, such as
1287
1287
  # user hash, are kept the same.
1288
1288
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
1289
- executor.schedule_request(
1289
+ await executor.schedule_request_async(
1290
1290
  request_id=request.state.request_id,
1291
1291
  request_name='download_logs',
1292
1292
  request_body=cluster_jobs_body,
@@ -1437,7 +1437,7 @@ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
1437
1437
  async def cost_report(request: fastapi.Request,
1438
1438
  cost_report_body: payloads.CostReportBody) -> None:
1439
1439
  """Gets the cost report of a cluster."""
1440
- executor.schedule_request(
1440
+ await executor.schedule_request_async(
1441
1441
  request_id=request.state.request_id,
1442
1442
  request_name='cost_report',
1443
1443
  request_body=cost_report_body,
@@ -1449,7 +1449,7 @@ async def cost_report(request: fastapi.Request,
1449
1449
  @app.get('/storage/ls')
1450
1450
  async def storage_ls(request: fastapi.Request) -> None:
1451
1451
  """Gets the storages."""
1452
- executor.schedule_request(
1452
+ await executor.schedule_request_async(
1453
1453
  request_id=request.state.request_id,
1454
1454
  request_name='storage_ls',
1455
1455
  request_body=payloads.RequestBody(),
@@ -1462,7 +1462,7 @@ async def storage_ls(request: fastapi.Request) -> None:
1462
1462
  async def storage_delete(request: fastapi.Request,
1463
1463
  storage_body: payloads.StorageBody) -> None:
1464
1464
  """Deletes a storage."""
1465
- executor.schedule_request(
1465
+ await executor.schedule_request_async(
1466
1466
  request_id=request.state.request_id,
1467
1467
  request_name='storage_delete',
1468
1468
  request_body=storage_body,
@@ -1475,7 +1475,7 @@ async def storage_delete(request: fastapi.Request,
1475
1475
  async def local_up(request: fastapi.Request,
1476
1476
  local_up_body: payloads.LocalUpBody) -> None:
1477
1477
  """Launches a Kubernetes cluster on API server."""
1478
- executor.schedule_request(
1478
+ await executor.schedule_request_async(
1479
1479
  request_id=request.state.request_id,
1480
1480
  request_name='local_up',
1481
1481
  request_body=local_up_body,
@@ -1488,7 +1488,7 @@ async def local_up(request: fastapi.Request,
1488
1488
  async def local_down(request: fastapi.Request,
1489
1489
  local_down_body: payloads.LocalDownBody) -> None:
1490
1490
  """Tears down the Kubernetes cluster started by local_up."""
1491
- executor.schedule_request(
1491
+ await executor.schedule_request_async(
1492
1492
  request_id=request.state.request_id,
1493
1493
  request_name='local_down',
1494
1494
  request_body=local_down_body,
@@ -1566,7 +1566,7 @@ async def stream(
1566
1566
  detail='Only one of request_id and log_path can be provided')
1567
1567
 
1568
1568
  if request_id is None and log_path is None:
1569
- request_id = requests_lib.get_latest_request_id()
1569
+ request_id = await requests_lib.get_latest_request_id_async()
1570
1570
  if request_id is None:
1571
1571
  raise fastapi.HTTPException(status_code=404,
1572
1572
  detail='No request found')
@@ -1672,7 +1672,7 @@ async def stream(
1672
1672
  async def api_cancel(request: fastapi.Request,
1673
1673
  request_cancel_body: payloads.RequestCancelBody) -> None:
1674
1674
  """Cancels requests."""
1675
- executor.schedule_request(
1675
+ await executor.schedule_request_async(
1676
1676
  request_id=request.state.request_id,
1677
1677
  request_name='api_cancel',
1678
1678
  request_body=request_cancel_body,
@@ -1908,7 +1908,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1908
1908
  async def all_contexts(request: fastapi.Request) -> None:
1909
1909
  """Gets all Kubernetes and SSH node pool contexts."""
1910
1910
 
1911
- executor.schedule_request(
1911
+ await executor.schedule_request_async(
1912
1912
  request_id=request.state.request_id,
1913
1913
  request_name='all_contexts',
1914
1914
  request_body=payloads.RequestBody(),
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
98
98
  version_locations = %(here)s/../schemas/db/serve_state
99
99
  version_table = alembic_version_serve_state_db
100
100
 
101
+ [sky_config_db]
102
+ version_locations = %(here)s/../schemas/db/skypilot_config
103
+ version_table = alembic_version_sky_config_db
104
+
101
105
  [post_write_hooks]
102
106
  # post_write_hooks defines scripts or Python functions that are run
103
107
  # on newly generated revision scripts. See the documentation for further