skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/backends/backend_utils.py +11 -11
- sky/backends/cloud_vm_ray_backend.py +15 -4
- sky/client/cli/command.py +39 -10
- sky/client/cli/flags.py +4 -2
- sky/client/sdk.py +26 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/global_user_state.py +137 -37
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +21 -12
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/provision/kubernetes/network.py +9 -6
- sky/provision/provisioner.py +8 -0
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +8 -7
- sky/server/common.py +10 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +4 -2
- sky/server/requests/executor.py +30 -28
- sky/server/requests/payloads.py +5 -1
- sky/server/requests/preconditions.py +9 -4
- sky/server/requests/requests.py +130 -53
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +91 -58
- sky/server/stream_utils.py +127 -38
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/asyncio_utils.py +63 -3
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
- sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
- /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/common.py
CHANGED
|
@@ -17,7 +17,6 @@ import time
|
|
|
17
17
|
import typing
|
|
18
18
|
from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
|
|
19
19
|
Tuple, TypeVar, Union)
|
|
20
|
-
from urllib import parse
|
|
21
20
|
import uuid
|
|
22
21
|
|
|
23
22
|
import cachetools
|
|
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
|
|
|
342
341
|
@annotations.lru_cache(scope='global')
|
|
343
342
|
def get_dashboard_url(server_url: str,
|
|
344
343
|
starting_page: Optional[str] = None) -> str:
|
|
345
|
-
|
|
346
|
-
# format of https://username:password@example.com:8080/path
|
|
347
|
-
# We need to remove the username and password and only
|
|
348
|
-
# return `https://example.com:8080/path`
|
|
349
|
-
parsed = parse.urlparse(server_url)
|
|
350
|
-
# Reconstruct the URL without credentials but keeping the scheme
|
|
351
|
-
dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
|
|
352
|
-
if parsed.port:
|
|
353
|
-
dashboard_url = f'{dashboard_url}:{parsed.port}'
|
|
354
|
-
if parsed.path:
|
|
355
|
-
dashboard_url = f'{dashboard_url}{parsed.path}'
|
|
356
|
-
dashboard_url = dashboard_url.rstrip('/')
|
|
344
|
+
dashboard_url = server_url.rstrip('/')
|
|
357
345
|
dashboard_url = f'{dashboard_url}/dashboard'
|
|
358
346
|
if starting_page:
|
|
359
347
|
dashboard_url = f'{dashboard_url}/{starting_page}'
|
|
@@ -490,6 +478,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
|
490
478
|
def handle_request_error(response: 'requests.Response') -> None:
|
|
491
479
|
# Keep the original HTTPError if the response code >= 400
|
|
492
480
|
response.raise_for_status()
|
|
481
|
+
|
|
493
482
|
# Other status codes are not expected either, e.g. we do not expect to
|
|
494
483
|
# handle redirection here.
|
|
495
484
|
if response.status_code != 200:
|
|
@@ -916,12 +905,18 @@ def reload_for_new_request(client_entrypoint: Optional[str],
|
|
|
916
905
|
client_command: Optional[str],
|
|
917
906
|
using_remote_api_server: bool, user: 'models.User',
|
|
918
907
|
request_id: str) -> None:
|
|
919
|
-
"""Reload modules, global variables, and usage message for a new request.
|
|
908
|
+
"""Reload modules, global variables, and usage message for a new request.
|
|
909
|
+
|
|
910
|
+
Must be called within the request's context.
|
|
911
|
+
"""
|
|
920
912
|
# This should be called first to make sure the logger is up-to-date.
|
|
921
913
|
sky_logging.reload_logger()
|
|
922
914
|
|
|
923
915
|
# Reload the skypilot config to make sure the latest config is used.
|
|
924
|
-
|
|
916
|
+
# We don't need to grab the lock here because this function is only
|
|
917
|
+
# run once we are inside the request's context, so there shouldn't
|
|
918
|
+
# be any race conditions when reloading the config.
|
|
919
|
+
skypilot_config.reload_config()
|
|
925
920
|
|
|
926
921
|
# Reset the client entrypoint and command for the usage message.
|
|
927
922
|
common_utils.set_request_context(
|
sky/server/constants.py
CHANGED
|
@@ -10,7 +10,7 @@ from sky.skylet import constants
|
|
|
10
10
|
# based on version info is needed.
|
|
11
11
|
# For more details and code guidelines, refer to:
|
|
12
12
|
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
|
13
|
-
API_VERSION =
|
|
13
|
+
API_VERSION = 21
|
|
14
14
|
|
|
15
15
|
# The minimum peer API version that the code should still work with.
|
|
16
16
|
# Notes (dev):
|
sky/server/daemons.py
CHANGED
|
@@ -38,9 +38,11 @@ class InternalRequestDaemon:
|
|
|
38
38
|
try:
|
|
39
39
|
# Refresh config within the while loop.
|
|
40
40
|
# Since this is a long running daemon,
|
|
41
|
-
#
|
|
41
|
+
# reload_for_new_request()
|
|
42
42
|
# is not called in between the event runs.
|
|
43
|
-
|
|
43
|
+
# We don't need to grab the lock here because each of the daemons
|
|
44
|
+
# runs in its own process and thus has its own request context.
|
|
45
|
+
skypilot_config.reload_config()
|
|
44
46
|
# Get the configured log level for the daemon inside the event loop
|
|
45
47
|
# in case the log level changes after the API server is started.
|
|
46
48
|
level_str = skypilot_config.get_nested(
|
sky/server/requests/executor.py
CHANGED
|
@@ -214,10 +214,11 @@ class RequestWorker:
|
|
|
214
214
|
time.sleep(0.1)
|
|
215
215
|
return
|
|
216
216
|
request_id, ignore_return_value, _ = request_element
|
|
217
|
-
request = api_requests.get_request(request_id)
|
|
217
|
+
request = api_requests.get_request(request_id, fields=['status'])
|
|
218
218
|
assert request is not None, f'Request with ID {request_id} is None'
|
|
219
219
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
|
220
220
|
return
|
|
221
|
+
del request
|
|
221
222
|
logger.info(f'[{self}] Submitting request: {request_id}')
|
|
222
223
|
# Start additional process to run the request, so that it can be
|
|
223
224
|
# cancelled when requested by a user.
|
|
@@ -328,10 +329,7 @@ def override_request_env_and_config(
|
|
|
328
329
|
# through the execution.
|
|
329
330
|
user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
|
|
330
331
|
name=request_body.env_vars[constants.USER_ENV_VAR])
|
|
331
|
-
global_user_state.add_or_update_user(user)
|
|
332
|
-
# Refetch the user to get the latest user info, including the created_at
|
|
333
|
-
# field.
|
|
334
|
-
user = global_user_state.get_user(user.id)
|
|
332
|
+
_, user = global_user_state.add_or_update_user(user, return_user=True)
|
|
335
333
|
|
|
336
334
|
# Force color to be enabled.
|
|
337
335
|
os.environ['CLICOLOR_FORCE'] = '1'
|
|
@@ -621,8 +619,8 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
621
619
|
logger.info(f'Executing request {request.request_id} in coroutine')
|
|
622
620
|
func = request.entrypoint
|
|
623
621
|
request_body = request.request_body
|
|
624
|
-
|
|
625
|
-
|
|
622
|
+
await api_requests.update_status_async(request.request_id,
|
|
623
|
+
api_requests.RequestStatus.RUNNING)
|
|
626
624
|
# Redirect stdout and stderr to the request log path.
|
|
627
625
|
original_output = ctx.redirect_log(request.log_path)
|
|
628
626
|
try:
|
|
@@ -632,7 +630,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
632
630
|
**request_body.to_kwargs())
|
|
633
631
|
except Exception as e: # pylint: disable=broad-except
|
|
634
632
|
ctx.redirect_log(original_output)
|
|
635
|
-
api_requests.
|
|
633
|
+
await api_requests.set_request_failed_async(request.request_id, e)
|
|
636
634
|
logger.error(f'Failed to run request {request.request_id} due to '
|
|
637
635
|
f'{common_utils.format_exception(e)}')
|
|
638
636
|
return
|
|
@@ -649,14 +647,15 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
649
647
|
if fut.done():
|
|
650
648
|
try:
|
|
651
649
|
result = await fut
|
|
652
|
-
api_requests.
|
|
650
|
+
await api_requests.set_request_succeeded_async(
|
|
651
|
+
request_id, result)
|
|
653
652
|
except asyncio.CancelledError:
|
|
654
653
|
# The task is cancelled by ctx.cancel(), where the status
|
|
655
654
|
# should already be set to CANCELLED.
|
|
656
655
|
pass
|
|
657
656
|
except Exception as e: # pylint: disable=broad-except
|
|
658
657
|
ctx.redirect_log(original_output)
|
|
659
|
-
api_requests.
|
|
658
|
+
await api_requests.set_request_failed_async(request_id, e)
|
|
660
659
|
logger.error(f'Request {request_id} failed due to '
|
|
661
660
|
f'{common_utils.format_exception(e)}')
|
|
662
661
|
return True
|
|
@@ -671,13 +670,13 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
671
670
|
except asyncio.CancelledError:
|
|
672
671
|
# Current coroutine is cancelled due to client disconnect, set the
|
|
673
672
|
# request status for consistency.
|
|
674
|
-
api_requests.
|
|
673
|
+
await api_requests.set_request_cancelled_async(request.request_id)
|
|
675
674
|
pass
|
|
676
675
|
# pylint: disable=broad-except
|
|
677
676
|
except (Exception, KeyboardInterrupt, SystemExit) as e:
|
|
678
677
|
# Handle any other error
|
|
679
678
|
ctx.redirect_log(original_output)
|
|
680
|
-
api_requests.
|
|
679
|
+
await api_requests.set_request_failed_async(request.request_id, e)
|
|
681
680
|
logger.error(f'Request {request.request_id} interrupted due to '
|
|
682
681
|
f'unhandled exception: {common_utils.format_exception(e)}')
|
|
683
682
|
raise
|
|
@@ -687,7 +686,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
|
|
|
687
686
|
ctx.cancel()
|
|
688
687
|
|
|
689
688
|
|
|
690
|
-
def
|
|
689
|
+
async def prepare_request_async(
|
|
691
690
|
request_id: str,
|
|
692
691
|
request_name: str,
|
|
693
692
|
request_body: payloads.RequestBody,
|
|
@@ -713,7 +712,7 @@ def prepare_request(
|
|
|
713
712
|
user_id=user_id,
|
|
714
713
|
cluster_name=request_cluster_name)
|
|
715
714
|
|
|
716
|
-
if not api_requests.
|
|
715
|
+
if not await api_requests.create_if_not_exists_async(request):
|
|
717
716
|
raise exceptions.RequestAlreadyExistsError(
|
|
718
717
|
f'Request {request_id} already exists.')
|
|
719
718
|
|
|
@@ -721,17 +720,18 @@ def prepare_request(
|
|
|
721
720
|
return request
|
|
722
721
|
|
|
723
722
|
|
|
724
|
-
def
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
723
|
+
async def schedule_request_async(request_id: str,
|
|
724
|
+
request_name: str,
|
|
725
|
+
request_body: payloads.RequestBody,
|
|
726
|
+
func: Callable[P, Any],
|
|
727
|
+
request_cluster_name: Optional[str] = None,
|
|
728
|
+
ignore_return_value: bool = False,
|
|
729
|
+
schedule_type: api_requests.ScheduleType = (
|
|
730
|
+
api_requests.ScheduleType.LONG),
|
|
731
|
+
is_skypilot_system: bool = False,
|
|
732
|
+
precondition: Optional[
|
|
733
|
+
preconditions.Precondition] = None,
|
|
734
|
+
retryable: bool = False) -> None:
|
|
735
735
|
"""Enqueue a request to the request queue.
|
|
736
736
|
|
|
737
737
|
Args:
|
|
@@ -752,9 +752,11 @@ def schedule_request(request_id: str,
|
|
|
752
752
|
The precondition is waited asynchronously and does not block the
|
|
753
753
|
caller.
|
|
754
754
|
"""
|
|
755
|
-
request_task =
|
|
756
|
-
|
|
757
|
-
|
|
755
|
+
request_task = await prepare_request_async(request_id, request_name,
|
|
756
|
+
request_body, func,
|
|
757
|
+
request_cluster_name,
|
|
758
|
+
schedule_type,
|
|
759
|
+
is_skypilot_system)
|
|
758
760
|
schedule_prepared_request(request_task, ignore_return_value, precondition,
|
|
759
761
|
retryable)
|
|
760
762
|
|
sky/server/requests/payloads.py
CHANGED
|
@@ -363,9 +363,10 @@ class CancelBody(RequestBody):
|
|
|
363
363
|
return kwargs
|
|
364
364
|
|
|
365
365
|
|
|
366
|
-
class
|
|
366
|
+
class ProvisionLogsBody(RequestBody):
|
|
367
367
|
"""Cluster node."""
|
|
368
368
|
cluster_name: str
|
|
369
|
+
worker: Optional[int] = None
|
|
369
370
|
|
|
370
371
|
|
|
371
372
|
class ClusterJobBody(RequestBody):
|
|
@@ -541,6 +542,9 @@ class JobsQueueV2Body(RequestBody):
|
|
|
541
542
|
page: Optional[int] = None
|
|
542
543
|
limit: Optional[int] = None
|
|
543
544
|
statuses: Optional[List[str]] = None
|
|
545
|
+
# The fields to return in the response.
|
|
546
|
+
# Refer to the fields in the `class ManagedJobRecord` in `responses.py`
|
|
547
|
+
fields: Optional[List[str]] = None
|
|
544
548
|
|
|
545
549
|
|
|
546
550
|
class JobsCancelBody(RequestBody):
|
|
@@ -90,7 +90,7 @@ class Precondition(abc.ABC):
|
|
|
90
90
|
while True:
|
|
91
91
|
if self.timeout > 0 and time.time() - start_time > self.timeout:
|
|
92
92
|
# Cancel the request on timeout.
|
|
93
|
-
api_requests.
|
|
93
|
+
await api_requests.set_request_failed_async(
|
|
94
94
|
self.request_id,
|
|
95
95
|
exceptions.RequestCancelled(
|
|
96
96
|
f'Request {self.request_id} precondition wait timed '
|
|
@@ -98,13 +98,15 @@ class Precondition(abc.ABC):
|
|
|
98
98
|
return False
|
|
99
99
|
|
|
100
100
|
# Check if the request has been cancelled
|
|
101
|
-
request = await api_requests.get_request_async(self.request_id
|
|
101
|
+
request = await api_requests.get_request_async(self.request_id,
|
|
102
|
+
fields=['status'])
|
|
102
103
|
if request is None:
|
|
103
104
|
logger.error(f'Request {self.request_id} not found')
|
|
104
105
|
return False
|
|
105
106
|
if request.status == api_requests.RequestStatus.CANCELLED:
|
|
106
107
|
logger.debug(f'Request {self.request_id} cancelled')
|
|
107
108
|
return False
|
|
109
|
+
del request
|
|
108
110
|
|
|
109
111
|
try:
|
|
110
112
|
met, status_msg = await self.check()
|
|
@@ -116,7 +118,7 @@ class Precondition(abc.ABC):
|
|
|
116
118
|
self.request_id, status_msg)
|
|
117
119
|
last_status_msg = status_msg
|
|
118
120
|
except (Exception, SystemExit, KeyboardInterrupt) as e: # pylint: disable=broad-except
|
|
119
|
-
api_requests.
|
|
121
|
+
await api_requests.set_request_failed_async(self.request_id, e)
|
|
120
122
|
logger.info(f'Request {self.request_id} failed due to '
|
|
121
123
|
f'{common_utils.format_exception(e)}')
|
|
122
124
|
return False
|
|
@@ -166,7 +168,10 @@ class ClusterStartCompletePrecondition(Precondition):
|
|
|
166
168
|
api_requests.RequestStatus.RUNNING
|
|
167
169
|
],
|
|
168
170
|
include_request_names=['sky.launch', 'sky.start'],
|
|
169
|
-
cluster_names=[self.cluster_name]
|
|
171
|
+
cluster_names=[self.cluster_name],
|
|
172
|
+
# Only get the request ID to avoid fetching the whole request.
|
|
173
|
+
# We're only interested in the count, not the whole request.
|
|
174
|
+
fields=['request_id']))
|
|
170
175
|
if len(requests) == 0:
|
|
171
176
|
# No running or pending tasks, the start process is done.
|
|
172
177
|
return True, None
|
sky/server/requests/requests.py
CHANGED
|
@@ -16,6 +16,7 @@ import time
|
|
|
16
16
|
import traceback
|
|
17
17
|
from typing import (Any, Callable, Dict, Generator, List, NamedTuple, Optional,
|
|
18
18
|
Tuple)
|
|
19
|
+
import uuid
|
|
19
20
|
|
|
20
21
|
import anyio
|
|
21
22
|
import colorama
|
|
@@ -293,6 +294,11 @@ class Request:
|
|
|
293
294
|
raise
|
|
294
295
|
|
|
295
296
|
|
|
297
|
+
def get_new_request_id() -> str:
|
|
298
|
+
"""Get a new request ID."""
|
|
299
|
+
return str(uuid.uuid4())
|
|
300
|
+
|
|
301
|
+
|
|
296
302
|
def encode_requests(requests: List[Request]) -> List[payloads.RequestPayload]:
|
|
297
303
|
"""Serialize the SkyPilot API request for display purposes.
|
|
298
304
|
|
|
@@ -400,7 +406,8 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
|
|
|
400
406
|
for request_task in get_request_tasks(req_filter=RequestTaskFilter(
|
|
401
407
|
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
402
408
|
exclude_request_names=[exclude_request_name],
|
|
403
|
-
cluster_names=[cluster_name]
|
|
409
|
+
cluster_names=[cluster_name],
|
|
410
|
+
fields=['request_id']))
|
|
404
411
|
]
|
|
405
412
|
kill_requests(request_ids)
|
|
406
413
|
|
|
@@ -425,7 +432,8 @@ def kill_requests(request_ids: Optional[List[str]] = None,
|
|
|
425
432
|
status=[RequestStatus.PENDING, RequestStatus.RUNNING],
|
|
426
433
|
# Avoid cancelling the cancel request itself.
|
|
427
434
|
exclude_request_names=['sky.api_cancel'],
|
|
428
|
-
user_id=user_id
|
|
435
|
+
user_id=user_id,
|
|
436
|
+
fields=['request_id']))
|
|
429
437
|
]
|
|
430
438
|
cancelled_request_ids = []
|
|
431
439
|
for request_id in request_ids:
|
|
@@ -592,6 +600,18 @@ def update_request(request_id: str) -> Generator[Optional[Request], None, None]:
|
|
|
592
600
|
_add_or_update_request_no_lock(request)
|
|
593
601
|
|
|
594
602
|
|
|
603
|
+
@init_db
|
|
604
|
+
@metrics_lib.time_me
|
|
605
|
+
@asyncio_utils.shield
|
|
606
|
+
async def update_status_async(request_id: str, status: RequestStatus) -> None:
|
|
607
|
+
"""Update the status of a request"""
|
|
608
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
609
|
+
request = await _get_request_no_lock_async(request_id)
|
|
610
|
+
if request is not None:
|
|
611
|
+
request.status = status
|
|
612
|
+
await _add_or_update_request_no_lock_async(request)
|
|
613
|
+
|
|
614
|
+
|
|
595
615
|
@init_db
|
|
596
616
|
@metrics_lib.time_me
|
|
597
617
|
@asyncio_utils.shield
|
|
@@ -604,62 +624,75 @@ async def update_status_msg_async(request_id: str, status_msg: str) -> None:
|
|
|
604
624
|
await _add_or_update_request_no_lock_async(request)
|
|
605
625
|
|
|
606
626
|
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
def _get_request_no_lock(request_id: str) -> Optional[Request]:
|
|
627
|
+
def _get_request_no_lock(
|
|
628
|
+
request_id: str,
|
|
629
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
612
630
|
"""Get a SkyPilot API request."""
|
|
613
631
|
assert _DB is not None
|
|
632
|
+
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
633
|
+
if fields:
|
|
634
|
+
columns_str = ', '.join(fields)
|
|
614
635
|
with _DB.conn:
|
|
615
636
|
cursor = _DB.conn.cursor()
|
|
616
|
-
cursor.execute(
|
|
637
|
+
cursor.execute((f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
638
|
+
'WHERE request_id LIKE ?'), (request_id + '%',))
|
|
617
639
|
row = cursor.fetchone()
|
|
618
640
|
if row is None:
|
|
619
641
|
return None
|
|
642
|
+
if fields:
|
|
643
|
+
row = _update_request_row_fields(row, fields)
|
|
620
644
|
return Request.from_row(row)
|
|
621
645
|
|
|
622
646
|
|
|
623
|
-
async def _get_request_no_lock_async(
|
|
647
|
+
async def _get_request_no_lock_async(
|
|
648
|
+
request_id: str,
|
|
649
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
624
650
|
"""Async version of _get_request_no_lock."""
|
|
625
651
|
assert _DB is not None
|
|
626
|
-
|
|
627
|
-
|
|
652
|
+
columns_str = ', '.join(REQUEST_COLUMNS)
|
|
653
|
+
if fields:
|
|
654
|
+
columns_str = ', '.join(fields)
|
|
655
|
+
async with _DB.execute_fetchall_async(
|
|
656
|
+
(f'SELECT {columns_str} FROM {REQUEST_TABLE} '
|
|
657
|
+
'WHERE request_id LIKE ?'), (request_id + '%',)) as rows:
|
|
628
658
|
row = rows[0] if rows else None
|
|
629
659
|
if row is None:
|
|
630
660
|
return None
|
|
661
|
+
if fields:
|
|
662
|
+
row = _update_request_row_fields(row, fields)
|
|
631
663
|
return Request.from_row(row)
|
|
632
664
|
|
|
633
665
|
|
|
634
|
-
@
|
|
666
|
+
@init_db_async
|
|
635
667
|
@metrics_lib.time_me
|
|
636
|
-
def
|
|
668
|
+
async def get_latest_request_id_async() -> Optional[str]:
|
|
637
669
|
"""Get the latest request ID."""
|
|
638
670
|
assert _DB is not None
|
|
639
|
-
with _DB.
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
row = cursor.fetchone()
|
|
644
|
-
return row[0] if row else None
|
|
671
|
+
async with _DB.execute_fetchall_async(
|
|
672
|
+
(f'SELECT request_id FROM {REQUEST_TABLE} '
|
|
673
|
+
'ORDER BY created_at DESC LIMIT 1')) as rows:
|
|
674
|
+
return rows[0][0] if rows else None
|
|
645
675
|
|
|
646
676
|
|
|
647
677
|
@init_db
|
|
648
678
|
@metrics_lib.time_me
|
|
649
|
-
def get_request(request_id: str
|
|
679
|
+
def get_request(request_id: str,
|
|
680
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
650
681
|
"""Get a SkyPilot API request."""
|
|
651
682
|
with filelock.FileLock(request_lock_path(request_id)):
|
|
652
|
-
return _get_request_no_lock(request_id)
|
|
683
|
+
return _get_request_no_lock(request_id, fields)
|
|
653
684
|
|
|
654
685
|
|
|
655
686
|
@init_db_async
|
|
656
687
|
@metrics_lib.time_me_async
|
|
657
688
|
@asyncio_utils.shield
|
|
658
|
-
async def get_request_async(
|
|
689
|
+
async def get_request_async(
|
|
690
|
+
request_id: str,
|
|
691
|
+
fields: Optional[List[str]] = None) -> Optional[Request]:
|
|
659
692
|
"""Async version of get_request."""
|
|
660
693
|
# TODO(aylei): figure out how to remove FileLock here to avoid the overhead
|
|
661
694
|
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
662
|
-
return await _get_request_no_lock_async(request_id)
|
|
695
|
+
return await _get_request_no_lock_async(request_id, fields)
|
|
663
696
|
|
|
664
697
|
|
|
665
698
|
class StatusWithMsg(NamedTuple):
|
|
@@ -696,17 +729,6 @@ async def get_request_status_async(
|
|
|
696
729
|
return StatusWithMsg(status, status_msg)
|
|
697
730
|
|
|
698
731
|
|
|
699
|
-
@init_db
|
|
700
|
-
@metrics_lib.time_me
|
|
701
|
-
def create_if_not_exists(request: Request) -> bool:
|
|
702
|
-
"""Create a SkyPilot API request if it does not exist."""
|
|
703
|
-
with filelock.FileLock(request_lock_path(request.request_id)):
|
|
704
|
-
if _get_request_no_lock(request.request_id) is not None:
|
|
705
|
-
return False
|
|
706
|
-
_add_or_update_request_no_lock(request)
|
|
707
|
-
return True
|
|
708
|
-
|
|
709
|
-
|
|
710
732
|
@init_db_async
|
|
711
733
|
@metrics_lib.time_me_async
|
|
712
734
|
@asyncio_utils.shield
|
|
@@ -896,6 +918,23 @@ def set_request_failed(request_id: str, e: BaseException) -> None:
|
|
|
896
918
|
request_task.set_error(e)
|
|
897
919
|
|
|
898
920
|
|
|
921
|
+
@init_db_async
|
|
922
|
+
@metrics_lib.time_me_async
|
|
923
|
+
@asyncio_utils.shield
|
|
924
|
+
async def set_request_failed_async(request_id: str, e: BaseException) -> None:
|
|
925
|
+
"""Set a request to failed and populate the error message."""
|
|
926
|
+
with ux_utils.enable_traceback():
|
|
927
|
+
stacktrace = traceback.format_exc()
|
|
928
|
+
setattr(e, 'stacktrace', stacktrace)
|
|
929
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
930
|
+
request_task = await _get_request_no_lock_async(request_id)
|
|
931
|
+
assert request_task is not None, request_id
|
|
932
|
+
request_task.status = RequestStatus.FAILED
|
|
933
|
+
request_task.finished_at = time.time()
|
|
934
|
+
request_task.set_error(e)
|
|
935
|
+
await _add_or_update_request_no_lock_async(request_task)
|
|
936
|
+
|
|
937
|
+
|
|
899
938
|
def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
|
900
939
|
"""Set a request to succeeded and populate the result."""
|
|
901
940
|
with update_request(request_id) as request_task:
|
|
@@ -906,28 +945,50 @@ def set_request_succeeded(request_id: str, result: Optional[Any]) -> None:
|
|
|
906
945
|
request_task.set_return_value(result)
|
|
907
946
|
|
|
908
947
|
|
|
909
|
-
|
|
948
|
+
@init_db_async
|
|
949
|
+
@metrics_lib.time_me_async
|
|
950
|
+
@asyncio_utils.shield
|
|
951
|
+
async def set_request_succeeded_async(request_id: str,
|
|
952
|
+
result: Optional[Any]) -> None:
|
|
953
|
+
"""Set a request to succeeded and populate the result."""
|
|
954
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
955
|
+
request_task = await _get_request_no_lock_async(request_id)
|
|
956
|
+
assert request_task is not None, request_id
|
|
957
|
+
request_task.status = RequestStatus.SUCCEEDED
|
|
958
|
+
request_task.finished_at = time.time()
|
|
959
|
+
if result is not None:
|
|
960
|
+
request_task.set_return_value(result)
|
|
961
|
+
await _add_or_update_request_no_lock_async(request_task)
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
@init_db_async
|
|
965
|
+
@metrics_lib.time_me_async
|
|
966
|
+
@asyncio_utils.shield
|
|
967
|
+
async def set_request_cancelled_async(request_id: str) -> None:
|
|
910
968
|
"""Set a pending or running request to cancelled."""
|
|
911
|
-
with
|
|
969
|
+
async with filelock.AsyncFileLock(request_lock_path(request_id)):
|
|
970
|
+
request_task = await _get_request_no_lock_async(request_id)
|
|
912
971
|
assert request_task is not None, request_id
|
|
913
972
|
# Already finished or cancelled.
|
|
914
973
|
if request_task.status > RequestStatus.RUNNING:
|
|
915
974
|
return
|
|
916
975
|
request_task.finished_at = time.time()
|
|
917
976
|
request_task.status = RequestStatus.CANCELLED
|
|
977
|
+
await _add_or_update_request_no_lock_async(request_task)
|
|
918
978
|
|
|
919
979
|
|
|
920
980
|
@init_db
|
|
921
981
|
@metrics_lib.time_me
|
|
922
|
-
async def _delete_requests(
|
|
982
|
+
async def _delete_requests(request_ids: List[str]):
|
|
923
983
|
"""Clean up requests by their IDs."""
|
|
924
|
-
id_list_str = ','.join(repr(
|
|
984
|
+
id_list_str = ','.join(repr(request_id) for request_id in request_ids)
|
|
925
985
|
assert _DB is not None
|
|
926
986
|
await _DB.execute_and_commit_async(
|
|
927
987
|
f'DELETE FROM {REQUEST_TABLE} WHERE request_id IN ({id_list_str})')
|
|
928
988
|
|
|
929
989
|
|
|
930
|
-
async def clean_finished_requests_with_retention(retention_seconds: int
|
|
990
|
+
async def clean_finished_requests_with_retention(retention_seconds: int,
|
|
991
|
+
batch_size: int = 1000):
|
|
931
992
|
"""Clean up finished requests older than the retention period.
|
|
932
993
|
|
|
933
994
|
This function removes old finished requests (SUCCEEDED, FAILED, CANCELLED)
|
|
@@ -936,24 +997,40 @@ async def clean_finished_requests_with_retention(retention_seconds: int):
|
|
|
936
997
|
Args:
|
|
937
998
|
retention_seconds: Requests older than this many seconds will be
|
|
938
999
|
deleted.
|
|
1000
|
+
batch_size: batch delete 'batch_size' requests at a time to
|
|
1001
|
+
avoid using too much memory at once and to let each
|
|
1002
|
+
db query complete in a reasonable time. All stale
|
|
1003
|
+
requests older than the retention period will be deleted
|
|
1004
|
+
regardless of the batch size.
|
|
939
1005
|
"""
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
1006
|
+
total_deleted = 0
|
|
1007
|
+
while True:
|
|
1008
|
+
reqs = await get_request_tasks_async(
|
|
1009
|
+
req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
|
|
1010
|
+
finished_before=time.time() -
|
|
1011
|
+
retention_seconds,
|
|
1012
|
+
limit=batch_size,
|
|
1013
|
+
fields=['request_id']))
|
|
1014
|
+
if len(reqs) == 0:
|
|
1015
|
+
break
|
|
1016
|
+
futs = []
|
|
1017
|
+
for req in reqs:
|
|
1018
|
+
# req.log_path is derived from request_id,
|
|
1019
|
+
# so it's ok to just grab the request_id in the above query.
|
|
1020
|
+
futs.append(
|
|
1021
|
+
asyncio.create_task(
|
|
1022
|
+
anyio.Path(
|
|
1023
|
+
req.log_path.absolute()).unlink(missing_ok=True)))
|
|
1024
|
+
await asyncio.gather(*futs)
|
|
1025
|
+
|
|
1026
|
+
await _delete_requests([req.request_id for req in reqs])
|
|
1027
|
+
total_deleted += len(reqs)
|
|
1028
|
+
if len(reqs) < batch_size:
|
|
1029
|
+
break
|
|
953
1030
|
|
|
954
1031
|
# To avoid leakage of the log file, logs must be deleted before the
|
|
955
1032
|
# request task in the database.
|
|
956
|
-
logger.info(f'Cleaned up {
|
|
1033
|
+
logger.info(f'Cleaned up {total_deleted} finished requests '
|
|
957
1034
|
f'older than {retention_seconds} seconds')
|
|
958
1035
|
|
|
959
1036
|
|
|
@@ -121,7 +121,7 @@ def encode_status_kubernetes(
|
|
|
121
121
|
encoded_cluster = dataclasses.asdict(cluster)
|
|
122
122
|
encoded_cluster['status'] = encoded_cluster['status'].value
|
|
123
123
|
encoded_unmanaged_clusters.append(encoded_cluster)
|
|
124
|
-
all_jobs = [job.model_dump() for job in all_jobs]
|
|
124
|
+
all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
|
|
125
125
|
return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
|
|
126
126
|
|
|
127
127
|
|
|
@@ -151,9 +151,9 @@ def encode_jobs_queue_v2(
|
|
|
151
151
|
for job in jobs:
|
|
152
152
|
job['status'] = job['status'].value
|
|
153
153
|
if total is None:
|
|
154
|
-
return [job.model_dump() for job in jobs]
|
|
154
|
+
return [job.model_dump(by_alias=True) for job in jobs]
|
|
155
155
|
return {
|
|
156
|
-
'jobs': [job.model_dump() for job in jobs],
|
|
156
|
+
'jobs': [job.model_dump(by_alias=True) for job in jobs],
|
|
157
157
|
'total': total,
|
|
158
158
|
'total_no_filter': total_no_filter,
|
|
159
159
|
'status_counts': status_counts
|