skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/client/cli/command.py +118 -30
- sky/client/cli/table_utils.py +14 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
- sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +117 -17
- sky/jobs/client/sdk.py +28 -9
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +7 -3
- sky/jobs/server/server.py +11 -11
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +281 -166
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +7 -7
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +1 -13
- sky/server/requests/executor.py +20 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +51 -25
- sky/server/requests/serializers/decoders.py +23 -10
- sky/server/requests/serializers/encoders.py +5 -4
- sky/server/rest.py +35 -1
- sky/server/server.py +34 -34
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/log_lib.py +8 -1
- sky/skylet/services.py +5 -5
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/db/db_utils.py +32 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/utils/subprocess_utils.py +13 -1
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
- sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/server/common.py
CHANGED
|
@@ -17,7 +17,6 @@ import time
|
|
|
17
17
|
import typing
|
|
18
18
|
from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
|
|
19
19
|
Tuple, TypeVar, Union)
|
|
20
|
-
from urllib import parse
|
|
21
20
|
import uuid
|
|
22
21
|
|
|
23
22
|
import cachetools
|
|
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
|
|
|
342
341
|
@annotations.lru_cache(scope='global')
|
|
343
342
|
def get_dashboard_url(server_url: str,
|
|
344
343
|
starting_page: Optional[str] = None) -> str:
|
|
345
|
-
|
|
346
|
-
# format of https://username:password@example.com:8080/path
|
|
347
|
-
# We need to remove the username and password and only
|
|
348
|
-
# return `https://example.com:8080/path`
|
|
349
|
-
parsed = parse.urlparse(server_url)
|
|
350
|
-
# Reconstruct the URL without credentials but keeping the scheme
|
|
351
|
-
dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
|
|
352
|
-
if parsed.port:
|
|
353
|
-
dashboard_url = f'{dashboard_url}:{parsed.port}'
|
|
354
|
-
if parsed.path:
|
|
355
|
-
dashboard_url = f'{dashboard_url}{parsed.path}'
|
|
356
|
-
dashboard_url = dashboard_url.rstrip('/')
|
|
344
|
+
dashboard_url = server_url.rstrip('/')
|
|
357
345
|
dashboard_url = f'{dashboard_url}/dashboard'
|
|
358
346
|
if starting_page:
|
|
359
347
|
dashboard_url = f'{dashboard_url}/{starting_page}'
|
sky/server/requests/executor.py
CHANGED
|
@@ -329,10 +329,7 @@ def override_request_env_and_config(
|
|
|
329
329
|
# through the execution.
|
|
330
330
|
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
331
331
|
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
332
|
-
global_user_state.add_or_update_user(user)
|
|
333
|
-
# Refetch the user to get the latest user info, including the created_at
|
|
334
|
-
# field.
|
|
335
|
-
user = global_user_state.get_user(user.id)
|
|
332
|
+
_, user = global_user_state.add_or_update_user(user, return_user=True)
|
|
336
333
|
|
|
337
334
|
# Force color to be enabled.
|
|
338
335
|
os.environ['CLICOLOR_FORCE'] = '1'
|
|
@@ -689,7 +686,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
689
686
|
ctx.cancel()
|
|
690
687
|
|
|
691
688
|
|
|
692
|
-
def prepare_request(
|
|
689
|
+
async def prepare_request_async(
|
|
693
690
|
request_id: str,
|
|
694
691
|
request_name: str,
|
|
695
692
|
request_body: payloads.RequestBody,
|
|
@@ -715,7 +712,7 @@ def prepare_request(
|
|
|
715
712
|
user_id=user_id,
|
|
716
713
|
cluster_name=request_cluster_name)
|
|
717
714
|
|
|
718
|
-
if not api_requests.
|
|
715
|
+
if not await api_requests.create_if_not_exists_async(request):
|
|
719
716
|
raise exceptions.RequestAlreadyExistsError(
|
|
720
717
|
f'Request {request_id} already exists.')
|
|
721
718
|
|
|
@@ -723,17 +720,18 @@ def prepare_request(
|
|
|
723
720
|
return request
|
|
724
721
|
|
|
725
722
|
|
|
726
|
-
def schedule_request(request_id: str,
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
723
|
+
async def schedule_request_async(request_id: str,
|
|
724
|
+
request_name: str,
|
|
725
|
+
request_body: payloads.RequestBody,
|
|
726
|
+
func: Callable[P, Any],
|
|
727
|
+
request_cluster_name: Optional[str] = None,
|
|
728
|
+
ignore_return_value: bool = False,
|
|
729
|
+
schedule_type: api_requests.ScheduleType = (
|
|
730
|
+
api_requests.ScheduleType.LONG),
|
|
731
|
+
is_skypilot_system: bool = False,
|
|
732
|
+
precondition: Optional[
|
|
733
|
+
preconditions.Precondition] = None,
|
|
734
|
+
retryable: bool = False) -> None:
|
|
737
735
|
"""Enqueue a request to the request queue.
|
|
738
736
|
|
|
739
737
|
Args:
|
|
@@ -754,9 +752,11 @@ def schedule_request(request_id: str,
|
|
|
754
752
|
The precondition is waited asynchronously and does not block the
|
|
755
753
|
caller.
|
|
756
754
|
"""
|
|
757
|
-
request_task =
|
|
758
|
-
|
|
759
|
-
|
|
755
|
+
request_task = await prepare_request_async(request_id, request_name,
|
|
756
|
+
request_body, func,
|
|
757
|
+
request_cluster_name,
|
|
758
|
+
schedule_type,
|
|
759
|
+
is_skypilot_system)
|
|
760
760
|
schedule_prepared_request(request_task, ignore_return_value, precondition,
|
|
761
761
|
retryable)
|
|
762
762
|
|
sky/server/requests/payloads.py
CHANGED
|
@@ -542,6 +542,9 @@ class JobsQueueV2Body(RequestBody):
|
|
|
542
542
|
page: Optional[int] = None
|
|
543
543
|
limit: Optional[int] = None
|
|
544
544
|
statuses: Optional[List[str]] = None
|
|
545
|
+
# The fields to return in the response.
|
|
546
|
+
# Refer to the fields in the `class ManagedJobRecord` in `responses.py`
|
|
547
|
+
fields: Optional[List[str]] = None
|
|
545
548
|
|
|
546
549
|
|
|
547
550
|
class JobsCancelBody(RequestBody):
|
sky/server/requests/requests.py
CHANGED
|
@@ -16,6 +16,7 @@ import time
|
|
|
16
16
|
import traceback
|
|
17
17
|
from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
|
|
18
18
|
Tuple)
|
|
19
|
+
import uuid
|
|
19
20
|
|
|
20
21
|
import anyio
|
|
21
22
|
import colorama
|
|
@@ -293,6 +294,11 @@ class Request:
|
|
|
293
294
|
raise
|
|
294
295
|
|
|
295
296
|
|
|
297
|
+
def get_new_request_id() -> str:
|
|
298
|
+
"""Get a new request ID."""
|
|
299
|
+
return str(uuid.uuid4())
|
|
300
|
+
|
|
301
|
+
|
|
296
302
|
def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
|
|
297
303
|
"""Serialize the SkyPilot API request for display purposes.
|
|
298
304
|
|
|
@@ -572,6 +578,26 @@ def reset_db_and_logs():
|
|
|
572
578
|
f'{server_common.API_SERVER_CLIENT_DIR.expanduser()}')
|
|
573
579
|
shutil.rmtree(server_common.API_SERVER_CLIENT_DIR.expanduser(),
|
|
574
580
|
ignore_errors=True)
|
|
581
|
+
with _init_db_lock:
|
|
582
|
+
_init_db_within_lock()
|
|
583
|
+
assert _DB is not None
|
|
584
|
+
with _DB.conn:
|
|
585
|
+
cursor = _DB.conn.cursor()
|
|
586
|
+
cursor.execute('SELECT sqlite_version()')
|
|
587
|
+
row = cursor.fetchone()
|
|
588
|
+
if row is None:
|
|
589
|
+
raise RuntimeError('Failed to get SQLite version')
|
|
590
|
+
version_str = row[0]
|
|
591
|
+
version_parts = version_str.split('.')
|
|
592
|
+
assert len(version_parts) >= 2, \
|
|
593
|
+
f'Invalid version string: {version_str}'
|
|
594
|
+
major, minor = int(version_parts[0]), int(version_parts[1])
|
|
595
|
+
# SQLite 3.35.0+ supports RETURNING statements.
|
|
596
|
+
# 3.35.0 was released in March 2021.
|
|
597
|
+
if not ((major > 3) or (major == 3 and minor >= 35)):
|
|
598
|
+
raise RuntimeError(
|
|
599
|
+
f'SQLite version {version_str} is not supported. '
|
|
600
|
+
'Please upgrade to SQLite 3.35.0 or later.')
|
|
575
601
|
|
|
576
602
|
|
|
577
603
|
def request_lock_path(request_id: str) -> str:
|
|
@@ -657,17 +683,15 @@ async def _get_request_no_lock_async(
|
|
|
657
683
|
return Request.from_row(row)
|
|
658
684
|
|
|
659
685
|
|
|
660
|
-
@
|
|
686
|
+
@init_db_async
|
|
661
687
|
@metrics_lib.time_me
|
|
662
|
-
def
|
|
688
|
+
async def get_latest_request_id_async() -> Optional[str]:
|
|
663
689
|
"""Get the latest request ID."""
|
|
664
690
|
assert _DB is not None
|
|
665
|
-
with _DB.
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
row = cursor.fetchone()
|
|
670
|
-
return row[0] if row else None
|
|
691
|
+
async with _DB.execute_fetchall_async(
|
|
692
|
+
(f'SELECT request_id FROM {REQUEST_TABLE} '
|
|
693
|
+
'ORDER BY created_at DESC LIMIT 1')) as rows:
|
|
694
|
+
return rows[0][0] if rows else None
|
|
671
695
|
|
|
672
696
|
|
|
673
697
|
@init_db
|
|
@@ -725,27 +749,29 @@ async def get_request_status_async(
|
|
|
725
749
|
return StatusWithMsg(status, status_msg)
|
|
726
750
|
|
|
727
751
|
|
|
728
|
-
@init_db
|
|
729
|
-
@metrics_lib.time_me
|
|
730
|
-
def create_if_not_exists(request: Request) -> bool:
|
|
731
|
-
"""Create a SkyPilot API request if it does not exist."""
|
|
732
|
-
with filelock.FileLock(request_lock_path(request.request_id)):
|
|
733
|
-
if _get_request_no_lock(request.request_id) is not None:
|
|
734
|
-
return False
|
|
735
|
-
_add_or_update_request_no_lock(request)
|
|
736
|
-
return True
|
|
737
|
-
|
|
738
|
-
|
|
739
752
|
@init_db_async
|
|
740
753
|
@metrics_lib.time_me_async
|
|
741
754
|
@asyncio_utils.shield
|
|
742
755
|
async def create_if_not_exists_async(request: Request) -> bool:
|
|
743
|
-
"""
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
756
|
+
"""Create a request if it does not exist, otherwise do nothing.
|
|
757
|
+
|
|
758
|
+
Returns:
|
|
759
|
+
True if a new request is created, False if the request already exists.
|
|
760
|
+
"""
|
|
761
|
+
assert _DB is not None
|
|
762
|
+
request_columns = ', '.join(REQUEST_COLUMNS)
|
|
763
|
+
values_str = ', '.join(['?'] * len(REQUEST_COLUMNS))
|
|
764
|
+
sql_statement = (
|
|
765
|
+
f'INSERT INTO {REQUEST_TABLE} '
|
|
766
|
+
f'({request_columns}) VALUES '
|
|
767
|
+
f'({values_str}) ON CONFLICT(request_id) DO NOTHING RETURNING ROWID')
|
|
768
|
+
request_row = request.to_row()
|
|
769
|
+
# Execute the SQL statement without getting the request lock.
|
|
770
|
+
# The request lock is used to prevent racing with cancellation codepath,
|
|
771
|
+
# but a request cannot be cancelled before it is created.
|
|
772
|
+
row = await _DB.execute_get_returning_value_async(sql_statement,
|
|
773
|
+
request_row)
|
|
774
|
+
return True if row else False
|
|
749
775
|
|
|
750
776
|
|
|
751
777
|
@dataclasses.dataclass
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import base64
|
|
3
3
|
import pickle
|
|
4
4
|
import typing
|
|
5
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
from sky import jobs as managed_jobs
|
|
8
8
|
from sky import models
|
|
@@ -116,22 +116,35 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
|
|
|
116
116
|
|
|
117
117
|
|
|
118
118
|
@register_decoders('jobs.queue_v2')
|
|
119
|
-
def decode_jobs_queue_v2(
|
|
119
|
+
def decode_jobs_queue_v2(
|
|
120
|
+
return_value
|
|
121
|
+
) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
|
|
122
|
+
List[responses.ManagedJobRecord]]:
|
|
120
123
|
"""Decode jobs queue response.
|
|
121
124
|
|
|
122
|
-
Supports legacy list, or a dict {jobs, total
|
|
123
|
-
|
|
125
|
+
Supports legacy list, or a dict {jobs, total, total_no_filter,
|
|
126
|
+
status_counts}.
|
|
127
|
+
|
|
128
|
+
- Returns either list[job] or tuple(list[job], total, status_counts,
|
|
129
|
+
total_no_filter)
|
|
124
130
|
"""
|
|
125
|
-
# Case 1: dict shape {jobs, total}
|
|
126
|
-
if isinstance(return_value, dict)
|
|
131
|
+
# Case 1: dict shape {jobs, total, total_no_filter, status_counts}
|
|
132
|
+
if isinstance(return_value, dict):
|
|
127
133
|
jobs = return_value.get('jobs', [])
|
|
134
|
+
total = return_value.get('total', len(jobs))
|
|
135
|
+
total_no_filter = return_value.get('total_no_filter', total)
|
|
136
|
+
status_counts = return_value.get('status_counts', {})
|
|
137
|
+
for job in jobs:
|
|
138
|
+
job['status'] = managed_jobs.ManagedJobStatus(job['status'])
|
|
139
|
+
jobs = [responses.ManagedJobRecord(**job) for job in jobs]
|
|
140
|
+
return jobs, total, status_counts, total_no_filter
|
|
128
141
|
else:
|
|
129
142
|
# Case 2: legacy list
|
|
130
143
|
jobs = return_value
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
144
|
+
for job in jobs:
|
|
145
|
+
job['status'] = managed_jobs.ManagedJobStatus(job['status'])
|
|
146
|
+
jobs = [responses.ManagedJobRecord(**job) for job in jobs]
|
|
147
|
+
return jobs
|
|
135
148
|
|
|
136
149
|
|
|
137
150
|
def _decode_serve_status(
|
|
@@ -121,7 +121,7 @@ def encode_status_kubernetes(
|
|
|
121
121
|
encoded_cluster = dataclasses.asdict(cluster)
|
|
122
122
|
encoded_cluster['status'] = encoded_cluster['status'].value
|
|
123
123
|
encoded_unmanaged_clusters.append(encoded_cluster)
|
|
124
|
-
all_jobs = [job.model_dump() for job in all_jobs]
|
|
124
|
+
all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
|
|
125
125
|
return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
|
|
126
126
|
|
|
127
127
|
|
|
@@ -148,12 +148,13 @@ def encode_jobs_queue_v2(
|
|
|
148
148
|
else:
|
|
149
149
|
jobs = jobs_or_tuple
|
|
150
150
|
total = None
|
|
151
|
-
for job in jobs
|
|
151
|
+
jobs_dict = [job.model_dump(by_alias=True) for job in jobs]
|
|
152
|
+
for job in jobs_dict:
|
|
152
153
|
job['status'] = job['status'].value
|
|
153
154
|
if total is None:
|
|
154
|
-
return
|
|
155
|
+
return jobs_dict
|
|
155
156
|
return {
|
|
156
|
-
'jobs':
|
|
157
|
+
'jobs': jobs_dict,
|
|
157
158
|
'total': total,
|
|
158
159
|
'total_no_filter': total_no_filter,
|
|
159
160
|
'status_counts': status_counts
|
sky/server/rest.py
CHANGED
|
@@ -256,6 +256,40 @@ def handle_server_unavailable(response: 'requests.Response') -> None:
|
|
|
256
256
|
raise exceptions.ServerTemporarilyUnavailableError(error_msg)
|
|
257
257
|
|
|
258
258
|
|
|
259
|
+
async def handle_server_unavailable_async(
|
|
260
|
+
response: 'aiohttp.ClientResponse') -> None:
|
|
261
|
+
"""Async version: Handle 503 (Service Unavailable) error
|
|
262
|
+
|
|
263
|
+
The client gets a 503 error in the following cases:
|
|
264
|
+
1. The reverse proxy cannot find any ready backend endpoints to serve the
|
|
265
|
+
request, e.g. when there is a rolling update.
|
|
266
|
+
2. The skypilot API server has temporary resource issue, e.g. when the
|
|
267
|
+
concurrency of the handling process is exhausted.
|
|
268
|
+
|
|
269
|
+
We expect the caller (CLI or SDK) to retry in these cases and show a clear wait
|
|
270
|
+
message to the user so the user can decide whether to keep waiting or abort the
|
|
271
|
+
request.
|
|
272
|
+
"""
|
|
273
|
+
if response.status != 503:
|
|
274
|
+
return
|
|
275
|
+
|
|
276
|
+
error_msg = ''
|
|
277
|
+
try:
|
|
278
|
+
response_data = await response.json()
|
|
279
|
+
if 'detail' in response_data:
|
|
280
|
+
error_msg = response_data['detail']
|
|
281
|
+
except Exception: # pylint: disable=broad-except
|
|
282
|
+
try:
|
|
283
|
+
text = await response.text()
|
|
284
|
+
if text:
|
|
285
|
+
error_msg = text
|
|
286
|
+
except Exception: # pylint: disable=broad-except
|
|
287
|
+
pass
|
|
288
|
+
|
|
289
|
+
with ux_utils.print_exception_no_traceback():
|
|
290
|
+
raise exceptions.ServerTemporarilyUnavailableError(error_msg)
|
|
291
|
+
|
|
292
|
+
|
|
259
293
|
@_retry_on_server_unavailable()
|
|
260
294
|
def request(method, url, **kwargs) -> 'requests.Response':
|
|
261
295
|
"""Send a request to the API server, retry on server temporarily
|
|
@@ -332,7 +366,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
|
|
|
332
366
|
response = await session.request(method, url, **kwargs)
|
|
333
367
|
|
|
334
368
|
# Handle server unavailability (503 status) - same as sync version
|
|
335
|
-
|
|
369
|
+
await handle_server_unavailable_async(response)
|
|
336
370
|
|
|
337
371
|
# Set remote API version and version from headers - same as sync version
|
|
338
372
|
remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
|
sky/server/server.py
CHANGED
|
@@ -163,7 +163,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
163
163
|
"""Middleware to add a request ID to each request."""
|
|
164
164
|
|
|
165
165
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
166
|
-
request_id =
|
|
166
|
+
request_id = requests_lib.get_new_request_id()
|
|
167
167
|
request.state.request_id = request_id
|
|
168
168
|
response = await call_next(request)
|
|
169
169
|
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
|
@@ -455,9 +455,9 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
455
455
|
loop.call_at(target, tick)
|
|
456
456
|
|
|
457
457
|
|
|
458
|
-
def
|
|
458
|
+
async def schedule_on_boot_check_async():
|
|
459
459
|
try:
|
|
460
|
-
executor.
|
|
460
|
+
await executor.schedule_request_async(
|
|
461
461
|
request_id='skypilot-server-on-boot-check',
|
|
462
462
|
request_name='check',
|
|
463
463
|
request_body=payloads.CheckBody(),
|
|
@@ -480,7 +480,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
480
480
|
if event.should_skip():
|
|
481
481
|
continue
|
|
482
482
|
try:
|
|
483
|
-
executor.
|
|
483
|
+
await executor.schedule_request_async(
|
|
484
484
|
request_id=event.id,
|
|
485
485
|
request_name=event.name,
|
|
486
486
|
request_body=payloads.RequestBody(),
|
|
@@ -495,7 +495,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
495
495
|
# Lifespan will be executed in each uvicorn worker process, we
|
|
496
496
|
# can safely ignore the error if the task is already scheduled.
|
|
497
497
|
logger.debug(f'Request {event.id} already exists.')
|
|
498
|
-
|
|
498
|
+
await schedule_on_boot_check_async()
|
|
499
499
|
asyncio.create_task(cleanup_upload_ids())
|
|
500
500
|
if metrics_utils.METRICS_ENABLED:
|
|
501
501
|
# Start monitoring the event loop lag in each server worker
|
|
@@ -729,7 +729,7 @@ async def token(request: fastapi.Request,
|
|
|
729
729
|
async def check(request: fastapi.Request,
|
|
730
730
|
check_body: payloads.CheckBody) -> None:
|
|
731
731
|
"""Checks enabled clouds."""
|
|
732
|
-
executor.
|
|
732
|
+
await executor.schedule_request_async(
|
|
733
733
|
request_id=request.state.request_id,
|
|
734
734
|
request_name='check',
|
|
735
735
|
request_body=check_body,
|
|
@@ -743,7 +743,7 @@ async def enabled_clouds(request: fastapi.Request,
|
|
|
743
743
|
workspace: Optional[str] = None,
|
|
744
744
|
expand: bool = False) -> None:
|
|
745
745
|
"""Gets enabled clouds on the server."""
|
|
746
|
-
executor.
|
|
746
|
+
await executor.schedule_request_async(
|
|
747
747
|
request_id=request.state.request_id,
|
|
748
748
|
request_name='enabled_clouds',
|
|
749
749
|
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
|
@@ -759,7 +759,7 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
759
759
|
realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
|
|
760
760
|
) -> None:
|
|
761
761
|
"""Gets real-time Kubernetes GPU availability."""
|
|
762
|
-
executor.
|
|
762
|
+
await executor.schedule_request_async(
|
|
763
763
|
request_id=request.state.request_id,
|
|
764
764
|
request_name='realtime_kubernetes_gpu_availability',
|
|
765
765
|
request_body=realtime_gpu_availability_body,
|
|
@@ -774,7 +774,7 @@ async def kubernetes_node_info(
|
|
|
774
774
|
kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
|
|
775
775
|
) -> None:
|
|
776
776
|
"""Gets Kubernetes nodes information and hints."""
|
|
777
|
-
executor.
|
|
777
|
+
await executor.schedule_request_async(
|
|
778
778
|
request_id=request.state.request_id,
|
|
779
779
|
request_name='kubernetes_node_info',
|
|
780
780
|
request_body=kubernetes_node_info_body,
|
|
@@ -786,7 +786,7 @@ async def kubernetes_node_info(
|
|
|
786
786
|
@app.get('/status_kubernetes')
|
|
787
787
|
async def status_kubernetes(request: fastapi.Request) -> None:
|
|
788
788
|
"""Gets Kubernetes status."""
|
|
789
|
-
executor.
|
|
789
|
+
await executor.schedule_request_async(
|
|
790
790
|
request_id=request.state.request_id,
|
|
791
791
|
request_name='status_kubernetes',
|
|
792
792
|
request_body=payloads.RequestBody(),
|
|
@@ -800,7 +800,7 @@ async def list_accelerators(
|
|
|
800
800
|
request: fastapi.Request,
|
|
801
801
|
list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
|
|
802
802
|
"""Gets list of accelerators from cloud catalog."""
|
|
803
|
-
executor.
|
|
803
|
+
await executor.schedule_request_async(
|
|
804
804
|
request_id=request.state.request_id,
|
|
805
805
|
request_name='list_accelerators',
|
|
806
806
|
request_body=list_accelerator_counts_body,
|
|
@@ -815,7 +815,7 @@ async def list_accelerator_counts(
|
|
|
815
815
|
list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
|
|
816
816
|
) -> None:
|
|
817
817
|
"""Gets list of accelerator counts from cloud catalog."""
|
|
818
|
-
executor.
|
|
818
|
+
await executor.schedule_request_async(
|
|
819
819
|
request_id=request.state.request_id,
|
|
820
820
|
request_name='list_accelerator_counts',
|
|
821
821
|
request_body=list_accelerator_counts_body,
|
|
@@ -872,7 +872,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
872
872
|
async def optimize(optimize_body: payloads.OptimizeBody,
|
|
873
873
|
request: fastapi.Request) -> None:
|
|
874
874
|
"""Optimizes the user's DAG."""
|
|
875
|
-
executor.
|
|
875
|
+
await executor.schedule_request_async(
|
|
876
876
|
request_id=request.state.request_id,
|
|
877
877
|
request_name='optimize',
|
|
878
878
|
request_body=optimize_body,
|
|
@@ -1082,7 +1082,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1082
1082
|
"""Launches a cluster or task."""
|
|
1083
1083
|
request_id = request.state.request_id
|
|
1084
1084
|
logger.info(f'Launching request: {request_id}')
|
|
1085
|
-
executor.
|
|
1085
|
+
await executor.schedule_request_async(
|
|
1086
1086
|
request_id,
|
|
1087
1087
|
request_name='launch',
|
|
1088
1088
|
request_body=launch_body,
|
|
@@ -1098,7 +1098,7 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1098
1098
|
async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
1099
1099
|
"""Executes a task on an existing cluster."""
|
|
1100
1100
|
cluster_name = exec_body.cluster_name
|
|
1101
|
-
executor.
|
|
1101
|
+
await executor.schedule_request_async(
|
|
1102
1102
|
request_id=request.state.request_id,
|
|
1103
1103
|
request_name='exec',
|
|
1104
1104
|
request_body=exec_body,
|
|
@@ -1116,7 +1116,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
|
1116
1116
|
async def stop(request: fastapi.Request,
|
|
1117
1117
|
stop_body: payloads.StopOrDownBody) -> None:
|
|
1118
1118
|
"""Stops a cluster."""
|
|
1119
|
-
executor.
|
|
1119
|
+
await executor.schedule_request_async(
|
|
1120
1120
|
request_id=request.state.request_id,
|
|
1121
1121
|
request_name='stop',
|
|
1122
1122
|
request_body=stop_body,
|
|
@@ -1136,7 +1136,7 @@ async def status(
|
|
|
1136
1136
|
raise fastapi.HTTPException(
|
|
1137
1137
|
status_code=503,
|
|
1138
1138
|
detail='Server is shutting down, please try again later.')
|
|
1139
|
-
executor.
|
|
1139
|
+
await executor.schedule_request_async(
|
|
1140
1140
|
request_id=request.state.request_id,
|
|
1141
1141
|
request_name='status',
|
|
1142
1142
|
request_body=status_body,
|
|
@@ -1151,7 +1151,7 @@ async def status(
|
|
|
1151
1151
|
async def endpoints(request: fastapi.Request,
|
|
1152
1152
|
endpoint_body: payloads.EndpointsBody) -> None:
|
|
1153
1153
|
"""Gets the endpoint for a given cluster and port number (endpoint)."""
|
|
1154
|
-
executor.
|
|
1154
|
+
await executor.schedule_request_async(
|
|
1155
1155
|
request_id=request.state.request_id,
|
|
1156
1156
|
request_name='endpoints',
|
|
1157
1157
|
request_body=endpoint_body,
|
|
@@ -1165,7 +1165,7 @@ async def endpoints(request: fastapi.Request,
|
|
|
1165
1165
|
async def down(request: fastapi.Request,
|
|
1166
1166
|
down_body: payloads.StopOrDownBody) -> None:
|
|
1167
1167
|
"""Tears down a cluster."""
|
|
1168
|
-
executor.
|
|
1168
|
+
await executor.schedule_request_async(
|
|
1169
1169
|
request_id=request.state.request_id,
|
|
1170
1170
|
request_name='down',
|
|
1171
1171
|
request_body=down_body,
|
|
@@ -1179,7 +1179,7 @@ async def down(request: fastapi.Request,
|
|
|
1179
1179
|
async def start(request: fastapi.Request,
|
|
1180
1180
|
start_body: payloads.StartBody) -> None:
|
|
1181
1181
|
"""Restarts a cluster."""
|
|
1182
|
-
executor.
|
|
1182
|
+
await executor.schedule_request_async(
|
|
1183
1183
|
request_id=request.state.request_id,
|
|
1184
1184
|
request_name='start',
|
|
1185
1185
|
request_body=start_body,
|
|
@@ -1193,7 +1193,7 @@ async def start(request: fastapi.Request,
|
|
|
1193
1193
|
async def autostop(request: fastapi.Request,
|
|
1194
1194
|
autostop_body: payloads.AutostopBody) -> None:
|
|
1195
1195
|
"""Schedules an autostop/autodown for a cluster."""
|
|
1196
|
-
executor.
|
|
1196
|
+
await executor.schedule_request_async(
|
|
1197
1197
|
request_id=request.state.request_id,
|
|
1198
1198
|
request_name='autostop',
|
|
1199
1199
|
request_body=autostop_body,
|
|
@@ -1207,7 +1207,7 @@ async def autostop(request: fastapi.Request,
|
|
|
1207
1207
|
async def queue(request: fastapi.Request,
|
|
1208
1208
|
queue_body: payloads.QueueBody) -> None:
|
|
1209
1209
|
"""Gets the job queue of a cluster."""
|
|
1210
|
-
executor.
|
|
1210
|
+
await executor.schedule_request_async(
|
|
1211
1211
|
request_id=request.state.request_id,
|
|
1212
1212
|
request_name='queue',
|
|
1213
1213
|
request_body=queue_body,
|
|
@@ -1221,7 +1221,7 @@ async def queue(request: fastapi.Request,
|
|
|
1221
1221
|
async def job_status(request: fastapi.Request,
|
|
1222
1222
|
job_status_body: payloads.JobStatusBody) -> None:
|
|
1223
1223
|
"""Gets the status of a job."""
|
|
1224
|
-
executor.
|
|
1224
|
+
await executor.schedule_request_async(
|
|
1225
1225
|
request_id=request.state.request_id,
|
|
1226
1226
|
request_name='job_status',
|
|
1227
1227
|
request_body=job_status_body,
|
|
@@ -1235,7 +1235,7 @@ async def job_status(request: fastapi.Request,
|
|
|
1235
1235
|
async def cancel(request: fastapi.Request,
|
|
1236
1236
|
cancel_body: payloads.CancelBody) -> None:
|
|
1237
1237
|
"""Cancels jobs on a cluster."""
|
|
1238
|
-
executor.
|
|
1238
|
+
await executor.schedule_request_async(
|
|
1239
1239
|
request_id=request.state.request_id,
|
|
1240
1240
|
request_name='cancel',
|
|
1241
1241
|
request_body=cancel_body,
|
|
@@ -1255,7 +1255,7 @@ async def logs(
|
|
|
1255
1255
|
# launch, to finish, so that a user does not need to manually pull the
|
|
1256
1256
|
# request status.
|
|
1257
1257
|
executor.check_request_thread_executor_available()
|
|
1258
|
-
request_task = executor.
|
|
1258
|
+
request_task = await executor.prepare_request_async(
|
|
1259
1259
|
request_id=request.state.request_id,
|
|
1260
1260
|
request_name='logs',
|
|
1261
1261
|
request_body=cluster_job_body,
|
|
@@ -1286,7 +1286,7 @@ async def download_logs(
|
|
|
1286
1286
|
# We should reuse the original request body, so that the env vars, such as
|
|
1287
1287
|
# user hash, are kept the same.
|
|
1288
1288
|
cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
|
|
1289
|
-
executor.
|
|
1289
|
+
await executor.schedule_request_async(
|
|
1290
1290
|
request_id=request.state.request_id,
|
|
1291
1291
|
request_name='download_logs',
|
|
1292
1292
|
request_body=cluster_jobs_body,
|
|
@@ -1437,7 +1437,7 @@ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
|
|
|
1437
1437
|
async def cost_report(request: fastapi.Request,
|
|
1438
1438
|
cost_report_body: payloads.CostReportBody) -> None:
|
|
1439
1439
|
"""Gets the cost report of a cluster."""
|
|
1440
|
-
executor.
|
|
1440
|
+
await executor.schedule_request_async(
|
|
1441
1441
|
request_id=request.state.request_id,
|
|
1442
1442
|
request_name='cost_report',
|
|
1443
1443
|
request_body=cost_report_body,
|
|
@@ -1449,7 +1449,7 @@ async def cost_report(request: fastapi.Request,
|
|
|
1449
1449
|
@app.get('/storage/ls')
|
|
1450
1450
|
async def storage_ls(request: fastapi.Request) -> None:
|
|
1451
1451
|
"""Gets the storages."""
|
|
1452
|
-
executor.
|
|
1452
|
+
await executor.schedule_request_async(
|
|
1453
1453
|
request_id=request.state.request_id,
|
|
1454
1454
|
request_name='storage_ls',
|
|
1455
1455
|
request_body=payloads.RequestBody(),
|
|
@@ -1462,7 +1462,7 @@ async def storage_ls(request: fastapi.Request) -> None:
|
|
|
1462
1462
|
async def storage_delete(request: fastapi.Request,
|
|
1463
1463
|
storage_body: payloads.StorageBody) -> None:
|
|
1464
1464
|
"""Deletes a storage."""
|
|
1465
|
-
executor.
|
|
1465
|
+
await executor.schedule_request_async(
|
|
1466
1466
|
request_id=request.state.request_id,
|
|
1467
1467
|
request_name='storage_delete',
|
|
1468
1468
|
request_body=storage_body,
|
|
@@ -1475,7 +1475,7 @@ async def storage_delete(request: fastapi.Request,
|
|
|
1475
1475
|
async def local_up(request: fastapi.Request,
|
|
1476
1476
|
local_up_body: payloads.LocalUpBody) -> None:
|
|
1477
1477
|
"""Launches a Kubernetes cluster on API server."""
|
|
1478
|
-
executor.
|
|
1478
|
+
await executor.schedule_request_async(
|
|
1479
1479
|
request_id=request.state.request_id,
|
|
1480
1480
|
request_name='local_up',
|
|
1481
1481
|
request_body=local_up_body,
|
|
@@ -1488,7 +1488,7 @@ async def local_up(request: fastapi.Request,
|
|
|
1488
1488
|
async def local_down(request: fastapi.Request,
|
|
1489
1489
|
local_down_body: payloads.LocalDownBody) -> None:
|
|
1490
1490
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1491
|
-
executor.
|
|
1491
|
+
await executor.schedule_request_async(
|
|
1492
1492
|
request_id=request.state.request_id,
|
|
1493
1493
|
request_name='local_down',
|
|
1494
1494
|
request_body=local_down_body,
|
|
@@ -1566,7 +1566,7 @@ async def stream(
|
|
|
1566
1566
|
detail='Only one of request_id and log_path can be provided')
|
|
1567
1567
|
|
|
1568
1568
|
if request_id is None and log_path is None:
|
|
1569
|
-
request_id = requests_lib.
|
|
1569
|
+
request_id = await requests_lib.get_latest_request_id_async()
|
|
1570
1570
|
if request_id is None:
|
|
1571
1571
|
raise fastapi.HTTPException(status_code=404,
|
|
1572
1572
|
detail='No request found')
|
|
@@ -1672,7 +1672,7 @@ async def stream(
|
|
|
1672
1672
|
async def api_cancel(request: fastapi.Request,
|
|
1673
1673
|
request_cancel_body: payloads.RequestCancelBody) -> None:
|
|
1674
1674
|
"""Cancels requests."""
|
|
1675
|
-
executor.
|
|
1675
|
+
await executor.schedule_request_async(
|
|
1676
1676
|
request_id=request.state.request_id,
|
|
1677
1677
|
request_name='api_cancel',
|
|
1678
1678
|
request_body=request_cancel_body,
|
|
@@ -1908,7 +1908,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1908
1908
|
async def all_contexts(request: fastapi.Request) -> None:
|
|
1909
1909
|
"""Gets all Kubernetes and SSH node pool contexts."""
|
|
1910
1910
|
|
|
1911
|
-
executor.
|
|
1911
|
+
await executor.schedule_request_async(
|
|
1912
1912
|
request_id=request.state.request_id,
|
|
1913
1913
|
request_name='all_contexts',
|
|
1914
1914
|
request_body=payloads.RequestBody(),
|
sky/setup_files/alembic.ini
CHANGED
|
@@ -98,6 +98,10 @@ version_table = alembic_version_spot_jobs_db
|
|
|
98
98
|
version_locations = %(here)s/../schemas/db/serve_state
|
|
99
99
|
version_table = alembic_version_serve_state_db
|
|
100
100
|
|
|
101
|
+
[sky_config_db]
|
|
102
|
+
version_locations = %(here)s/../schemas/db/skypilot_config
|
|
103
|
+
version_table = alembic_version_sky_config_db
|
|
104
|
+
|
|
101
105
|
[post_write_hooks]
|
|
102
106
|
# post_write_hooks defines scripts or Python functions that are run
|
|
103
107
|
# on newly generated revision scripts. See the documentation for further
|