skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/auth/oauth2_proxy.py
CHANGED
@@ -126,13 +126,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
 
     async def _authenticate(self, request: fastapi.Request, call_next,
                             session: aiohttp.ClientSession):
-        forwarded_headers =
+        forwarded_headers = {}
         auth_url = f'{self.proxy_base}/oauth2/auth'
         forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
-
-        # to reduce the auth overhead.
-        forwarded_headers.pop('content-length', None)
-        forwarded_headers.pop('content-type', None)
+        forwarded_headers['Host'] = request.url.hostname
         logger.debug(f'authenticate request: {auth_url}, '
                      f'headers: {forwarded_headers}')
 
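The middleware now builds the auth request's headers from scratch (X-Forwarded-Uri plus Host) instead of copying and pruning the incoming request's headers. Below is a minimal sketch of that call shape against an oauth2-proxy /oauth2/auth endpoint; the helper name, the proxy_base parameter, and the cookie pass-through are illustrative assumptions, not SkyPilot's exact middleware code.

import aiohttp


async def forward_auth_check(session: aiohttp.ClientSession, proxy_base: str,
                             request_url: str, hostname: str,
                             cookies: dict) -> int:
    # Forward only the two headers the auth endpoint needs.
    forwarded_headers = {
        'X-Forwarded-Uri': request_url.rstrip('/'),
        'Host': hostname,
    }
    async with session.get(f'{proxy_base}/oauth2/auth',
                           headers=forwarded_headers,
                           cookies=cookies) as resp:
        # oauth2-proxy's auth endpoint typically answers 202 when the
        # session is authenticated and 401 otherwise.
        return resp.status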
sky/server/common.py
CHANGED
@@ -17,7 +17,6 @@ import time
 import typing
 from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
                     Tuple, TypeVar, Union)
-from urllib import parse
 import uuid
 
 import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
 @annotations.lru_cache(scope='global')
 def get_dashboard_url(server_url: str,
                       starting_page: Optional[str] = None) -> str:
-
-    # format of https://username:password@example.com:8080/path
-    # We need to remove the username and password and only
-    # return `https://example.com:8080/path`
-    parsed = parse.urlparse(server_url)
-    # Reconstruct the URL without credentials but keeping the scheme
-    dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
-    if parsed.port:
-        dashboard_url = f'{dashboard_url}:{parsed.port}'
-    if parsed.path:
-        dashboard_url = f'{dashboard_url}{parsed.path}'
-    dashboard_url = dashboard_url.rstrip('/')
+    dashboard_url = server_url.rstrip('/')
     dashboard_url = f'{dashboard_url}/dashboard'
     if starting_page:
         dashboard_url = f'{dashboard_url}/{starting_page}'
@@ -490,6 +478,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
 def handle_request_error(response: 'requests.Response') -> None:
     # Keep the original HTTPError if the response code >= 400
     response.raise_for_status()
+
     # Other status codes are not expected neither, e.g. we do not expect to
     # handle redirection here.
     if response.status_code != 200:
@@ -550,19 +539,27 @@ def _start_api_server(deploy: bool = False,
                         'is not a local URL')
 
     # Check available memory before starting the server.
-
-    #
-
-
-
-
-    if
-
-
-
-
-
-
+    # Skip this warning if postgres is used, as:
+    # 1) that's almost certainly a remote API server;
+    # 2) the actual consolidation mode config is stashed in the database,
+    #    and the value of `job_utils.is_consolidation_mode` will not be
+    #    the actual value in the db, but only None as in this case, the
+    #    whole YAML config is really just `db: <URI>`.
+    if skypilot_config.get_nested(('db',), None) is None:
+        avail_mem_size_gb: float = common_utils.get_mem_size_gb()
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode(
+                          on_api_restart=True) else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if avail_mem_size_gb <= max_memory:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
+                f'only has {avail_mem_size_gb:.1f}GB memory available. '
+                f'At least {max_memory}GB is recommended to support higher '
+                'load with better performance.'
+                f'{colorama.Style.RESET_ALL}')
 
     args = [sys.executable, *API_SERVER_CMD.split()]
     if deploy:
@@ -914,12 +911,18 @@ def reload_for_new_request(client_entrypoint: Optional[str],
                            client_command: Optional[str],
                            using_remote_api_server: bool, user: 'models.User',
                            request_id: str) -> None:
-    """Reload modules, global variables, and usage message for a new request.
+    """Reload modules, global variables, and usage message for a new request.
+
+    Must be called within the request's context.
+    """
     # This should be called first to make sure the logger is up-to-date.
     sky_logging.reload_logger()
 
     # Reload the skypilot config to make sure the latest config is used.
-
+    # We don't need to grab the lock here because this function is only
+    # run once we are inside the request's context, so there shouldn't
+    # be any race conditions when reloading the config.
+    skypilot_config.reload_config()
 
     # Reset the client entrypoint and command for the usage message.
     common_utils.set_request_context(
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 22
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
@@ -64,3 +64,7 @@ DAEMON_RESTART_INTERVAL_SECONDS = 20
 
 # Cookie header for stream request id.
 STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
+
+# Valid empty values for pickled fields (base64-encoded pickled None)
+# base64.b64encode(pickle.dumps(None)).decode('utf-8')
+EMPTY_PICKLED_VALUE = 'gAROLg=='
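The new EMPTY_PICKLED_VALUE is the base64 encoding of a pickled None. The exact byte string depends on the pickle protocol; 'gAROLg==' corresponds to protocol 4 (an assumption in this snippet), while other protocols yield different prefixes:

import base64
import pickle

# b'\x80\x04N.' (PROTO 4, NONE, STOP opcodes) -> 'gAROLg=='
encoded = base64.b64encode(pickle.dumps(None, protocol=4)).decode('utf-8')
print(encoded)  # gAROLg==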
sky/server/daemons.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Callable
 from sky import sky_logging
 from sky import skypilot_config
 from sky.server import constants as server_constants
+from sky.server.requests import request_names
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import env_options
@@ -26,7 +27,7 @@ class InternalRequestDaemon:
     """Internal daemon that runs an event in the background."""
 
     id: str
-    name:
+    name: request_names.RequestName
     event_fn: Callable[[], None]
     default_log_level: str = 'INFO'
     should_skip: Callable[[], bool] = _default_should_skip
@@ -38,9 +39,11 @@ class InternalRequestDaemon:
         try:
             # Refresh config within the while loop.
             # Since this is a long running daemon,
-            #
+            # reload_for_new_request()
             # is not called in between the event runs.
-
+            # We don't need to grab the lock here because each of the daemons
+            # run in their own process and thus have their own request context.
+            skypilot_config.reload_config()
             # Get the configured log level for the daemon inside the event loop
             # in case the log level changes after the API server is started.
             level_str = skypilot_config.get_nested(
@@ -193,26 +196,31 @@ INTERNAL_REQUEST_DAEMONS = [
     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(
-
-
-
+    InternalRequestDaemon(
+        id='skypilot-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
+        event_fn=refresh_cluster_status_event,
+        default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(
-
-
+    InternalRequestDaemon(
+        id='skypilot-volume-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
+        event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name=
+                          name=request_names.RequestName.
+                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
-    InternalRequestDaemon(
-
-
-
-
-
-
-
+    InternalRequestDaemon(
+        id='sky-serve-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
+        event_fn=sky_serve_status_refresh_event,
+        should_skip=should_skip_sky_serve_status_refresh),
+    InternalRequestDaemon(
+        id='pool-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
+        event_fn=pool_status_refresh_event,
+        should_skip=should_skip_pool_status_refresh),
 ]
 
 
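Each daemon entry now carries a typed name from the new sky/server/requests/request_names module rather than a plain string. That module's contents are not shown in this diff; the sketch below only illustrates the general pattern (an enum-typed name field on a dataclass), the class is re-declared for illustration, and the enum values are placeholders rather than SkyPilot's:

import dataclasses
import enum
from typing import Callable


class RequestName(str, enum.Enum):
    # Placeholder values; only the member names appear in the diff above.
    REQUEST_DAEMON_STATUS_REFRESH = 'status-refresh-daemon'
    REQUEST_DAEMON_VOLUME_REFRESH = 'volume-refresh-daemon'


@dataclasses.dataclass
class InternalRequestDaemon:
    id: str
    name: RequestName
    event_fn: Callable[[], None]


daemon = InternalRequestDaemon(
    id='skypilot-status-refresh-daemon',
    name=RequestName.REQUEST_DAEMON_STATUS_REFRESH,
    event_fn=lambda: None)
print(daemon.name.name)  # REQUEST_DAEMON_STATUS_REFRESH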
sky/server/requests/executor.py
CHANGED
@@ -47,7 +47,9 @@ from sky.server import metrics as metrics_lib
 from sky.server.requests import payloads
 from sky.server.requests import preconditions
 from sky.server.requests import process
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
+from sky.server.requests import threads
 from sky.server.requests.queues import local_queue
 from sky.server.requests.queues import mp_queue
 from sky.skylet import constants
@@ -81,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
 # platforms, including macOS.
 multiprocessing.set_start_method('spawn', force=True)
 
+# An upper limit of max threads for request execution per server process that
+# unlikely to be reached to allow higher concurrency while still prevent the
+# server process become overloaded.
+_REQUEST_THREADS_LIMIT = 128
+
+_REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
+# A dedicated thread pool executor for synced requests execution in coroutine to
+# avoid:
+# 1. blocking the event loop;
+# 2. exhausting the default thread pool executor of event loop;
+_REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
+
+
+def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
+    """Lazy init and return the request thread executor for current process."""
+    global _REQUEST_THREAD_EXECUTOR
+    if _REQUEST_THREAD_EXECUTOR is not None:
+        return _REQUEST_THREAD_EXECUTOR
+    with _REQUEST_THREAD_EXECUTOR_LOCK:
+        if _REQUEST_THREAD_EXECUTOR is None:
+            _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
+                name='request_thread_executor',
+                max_workers=_REQUEST_THREADS_LIMIT)
+    return _REQUEST_THREAD_EXECUTOR
+
 
 class RequestQueue:
     """The queue for the requests, either redis or multiprocessing.
@@ -188,10 +215,11 @@ class RequestWorker:
             time.sleep(0.1)
             return
         request_id, ignore_return_value, _ = request_element
-        request = api_requests.get_request(request_id)
+        request = api_requests.get_request(request_id, fields=['status'])
         assert request is not None, f'Request with ID {request_id} is None'
         if request.status == api_requests.RequestStatus.CANCELLED:
             return
+        del request
         logger.info(f'[{self}] Submitting request: {request_id}')
         # Start additional process to run the request, so that it can be
         # cancelled when requested by a user.
@@ -302,10 +330,7 @@ def override_request_env_and_config(
     # through the execution.
     user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                        name=request_body.env_vars[constants.USER_ENV_VAR])
-    global_user_state.add_or_update_user(user)
-    # Refetch the user to get the latest user info, including the created_at
-    # field.
-    user = global_user_state.get_user(user.id)
+    _, user = global_user_state.add_or_update_user(user, return_user=True)
 
     # Force color to be enabled.
     os.environ['CLICOLOR_FORCE'] = '1'
@@ -349,32 +374,6 @@ def override_request_env_and_config(
         os.environ.update(original_env)
 
 
-def _get_current_output() -> Tuple[int, int]:
-    """Get the current stdout and stderr file descriptors."""
-    return os.dup(sys.stdout.fileno()), os.dup(sys.stderr.fileno())
-
-
-def _redirect_output(file: TextIO) -> None:
-    """Redirect stdout and stderr to the log file."""
-    # Get the file descriptor from the file object
-    fd = file.fileno()
-    # Copy this fd to stdout and stderr
-    os.dup2(fd, sys.stdout.fileno())
-    os.dup2(fd, sys.stderr.fileno())
-
-
-def _restore_output(original_stdout: Optional[int],
-                    original_stderr: Optional[int]) -> None:
-    """Restore stdout and stderr to their original file descriptors."""
-    if original_stdout is not None:
-        os.dup2(original_stdout, sys.stdout.fileno())
-        os.close(original_stdout)
-
-    if original_stderr is not None:
-        os.dup2(original_stderr, sys.stderr.fileno())
-        os.close(original_stderr)
-
-
 def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
     raise KeyboardInterrupt
 
@@ -397,11 +396,43 @@ def _request_execution_wrapper(request_id: str,
     rss_begin = proc.memory_info().rss
     db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
-    signal.signal(
+    # Only set up signal handlers in the main thread, as signal.signal() raises
+    # ValueError if called from a non-main thread (e.g., in tests).
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGTERM, _sigterm_handler)
 
     logger.info(f'Running request {request_id} with pid {pid}')
 
     original_stdout = original_stderr = None
+
+    def _save_current_output() -> None:
+        """Save the current stdout and stderr file descriptors."""
+        nonlocal original_stdout, original_stderr
+        original_stdout = os.dup(sys.stdout.fileno())
+        original_stderr = os.dup(sys.stderr.fileno())
+
+    def _redirect_output(file: TextIO) -> None:
+        """Redirect stdout and stderr to the log file."""
+        # Get the file descriptor from the file object
+        fd = file.fileno()
+        # Copy this fd to stdout and stderr
+        os.dup2(fd, sys.stdout.fileno())
+        os.dup2(fd, sys.stderr.fileno())
+
+    def _restore_output() -> None:
+        """Restore stdout and stderr to their original file descriptors."""
+        nonlocal original_stdout, original_stderr
+        if original_stdout is not None:
+            os.dup2(original_stdout, sys.stdout.fileno())
+            os.close(original_stdout)
+            original_stdout = None
+
+        if original_stderr is not None:
+            os.dup2(original_stderr, sys.stderr.fileno())
+            os.close(original_stderr)
+            original_stderr = None
+
+    request_name = None
     try:
         # As soon as the request is updated with the executor PID, we can
         # receive SIGTERM from cancellation. So, we update the request inside
@@ -422,7 +453,7 @@ def _request_execution_wrapper(request_id: str,
         # Store copies of the original stdout and stderr file descriptors
         # We do this in two steps because we should make sure to restore the
        # original values even if we are cancelled or fail during the redirect.
-
+        _save_current_output()
 
         # Append to the log file instead of overwriting it since there might be
         # logs from previous retries.
@@ -464,15 +495,14 @@ def _request_execution_wrapper(request_id: str,
         # clear the pid of the request.
         request_task.pid = None
         # Yield control to the scheduler for uniform handling of retries.
-        _restore_output(
+        _restore_output()
         raise
     except (Exception, SystemExit) as e:  # pylint: disable=broad-except
         api_requests.set_request_failed(request_id, e)
         # Manually reset the original stdout and stderr file descriptors early
         # so that the "Request xxxx failed due to ..." log message will be
         # written to the original stdout and stderr file descriptors.
-        _restore_output(
-        original_stdout = original_stderr = None
+        _restore_output()
         logger.info(f'Request {request_id} failed due to '
                     f'{common_utils.format_exception(e)}')
         return
@@ -482,11 +512,10 @@ def _request_execution_wrapper(request_id: str,
         # Manually reset the original stdout and stderr file descriptors early
         # so that the "Request xxxx failed due to ..." log message will be
         # written to the original stdout and stderr file descriptors.
-        _restore_output(
-        original_stdout = original_stderr = None
+        _restore_output()
         logger.info(f'Request {request_id} finished')
     finally:
-        _restore_output(
+        _restore_output()
         try:
             # Capture the peak RSS before GC.
             peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
@@ -495,7 +524,8 @@ def _request_execution_wrapper(request_id: str,
             annotations.clear_request_level_cache()
             with metrics_utils.time_it(name='release_memory', group='internal'):
                 common_utils.release_memory()
-
+            if request_name is not None:
+                _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
         except Exception as e:  # pylint: disable=broad-except
             logger.error(f'Failed to record memory metrics: '
                          f'{common_utils.format_exception(e)}')
@@ -539,6 +569,21 @@ class CoroutineTask:
         pass
 
 
+def check_request_thread_executor_available() -> None:
+    """Check if the request thread executor is available.
+
+    This is a best effort check to hint the client to retry other server
+    processes when there is no avaiable thread worker in current one. But
+    a request may pass this check and still cannot get worker on execution
+    time due to race condition. In this case, the client will see a failed
+    request instead of retry.
+
+    TODO(aylei): this can be refined with a refactor of our coroutine
+    execution flow.
+    """
+    get_request_thread_executor().check_available()
+
+
 def execute_request_in_coroutine(
         request: api_requests.Request) -> CoroutineTask:
     """Execute a request in current event loop.
@@ -553,6 +598,18 @@ def execute_request_in_coroutine(
     return CoroutineTask(task)
 
 
+def _execute_with_config_override(func: Callable,
+                                  request_body: payloads.RequestBody,
+                                  request_id: str, request_name: str,
+                                  **kwargs) -> Any:
+    """Execute a function with env and config override inside a thread."""
+    # Override the environment and config within this thread's context,
+    # which gets copied when we call to_thread.
+    with override_request_env_and_config(request_body, request_id,
+                                         request_name):
+        return func(**kwargs)
+
+
 async def _execute_request_coroutine(request: api_requests.Request):
     """Execute a request in current event loop.
 
@@ -566,39 +623,43 @@ async def _execute_request_coroutine(request: api_requests.Request):
     logger.info(f'Executing request {request.request_id} in coroutine')
     func = request.entrypoint
     request_body = request.request_body
-
-
+    await api_requests.update_status_async(request.request_id,
                                            api_requests.RequestStatus.RUNNING)
     # Redirect stdout and stderr to the request log path.
     original_output = ctx.redirect_log(request.log_path)
-
-
-
-
-
-
-
-
+    try:
+        fut: asyncio.Future = context_utils.to_thread_with_executor(
+            get_request_thread_executor(), _execute_with_config_override, func,
+            request_body, request.request_id, request.name,
+            **request_body.to_kwargs())
+    except Exception as e:  # pylint: disable=broad-except
+        ctx.redirect_log(original_output)
+        await api_requests.set_request_failed_async(request.request_id, e)
+        logger.error(f'Failed to run request {request.request_id} due to '
+                     f'{common_utils.format_exception(e)}')
+        return
 
     async def poll_task(request_id: str) -> bool:
-
-        if
+        req_status = await api_requests.get_request_status_async(request_id)
+        if req_status is None:
             raise RuntimeError('Request not found')
 
-        if
+        if req_status.status == api_requests.RequestStatus.CANCELLED:
             ctx.cancel()
             return True
 
         if fut.done():
             try:
                 result = await fut
-                api_requests.
+                await api_requests.set_request_succeeded_async(
                    request_id, result)
             except asyncio.CancelledError:
                 # The task is cancelled by ctx.cancel(), where the status
                 # should already be set to CANCELLED.
                 pass
             except Exception as e:  # pylint: disable=broad-except
                 ctx.redirect_log(original_output)
-                api_requests.
+                await api_requests.set_request_failed_async(request_id, e)
                 logger.error(f'Request {request_id} failed due to '
                              f'{common_utils.format_exception(e)}')
                 return True
@@ -613,13 +674,13 @@ async def _execute_request_coroutine(request: api_requests.Request):
     except asyncio.CancelledError:
         # Current coroutine is cancelled due to client disconnect, set the
         # request status for consistency.
-        api_requests.
+        await api_requests.set_request_cancelled_async(request.request_id)
         pass
     # pylint: disable=broad-except
     except (Exception, KeyboardInterrupt, SystemExit) as e:
         # Handle any other error
         ctx.redirect_log(original_output)
-        api_requests.
+        await api_requests.set_request_failed_async(request.request_id, e)
         logger.error(f'Request {request.request_id} interrupted due to '
                      f'unhandled exception: {common_utils.format_exception(e)}')
         raise
@@ -629,9 +690,9 @@ async def _execute_request_coroutine(request: api_requests.Request):
         ctx.cancel()
 
 
-def
+async def prepare_request_async(
         request_id: str,
-        request_name:
+        request_name: request_names.RequestName,
         request_body: payloads.RequestBody,
         func: Callable[P, Any],
         request_cluster_name: Optional[str] = None,
@@ -655,7 +716,7 @@ def prepare_request(
         user_id=user_id,
         cluster_name=request_cluster_name)
 
-    if not api_requests.
+    if not await api_requests.create_if_not_exists_async(request):
         raise exceptions.RequestAlreadyExistsError(
             f'Request {request_id} already exists.')
 
@@ -663,17 +724,18 @@
     return request
 
 
-def
-
-
-
-
-
-
-
-
-
+async def schedule_request_async(request_id: str,
+                                 request_name: request_names.RequestName,
+                                 request_body: payloads.RequestBody,
+                                 func: Callable[P, Any],
+                                 request_cluster_name: Optional[str] = None,
+                                 ignore_return_value: bool = False,
+                                 schedule_type: api_requests.ScheduleType = (
+                                     api_requests.ScheduleType.LONG),
+                                 is_skypilot_system: bool = False,
+                                 precondition: Optional[
+                                     preconditions.Precondition] = None,
+                                 retryable: bool = False) -> None:
     """Enqueue a request to the request queue.
 
     Args:
@@ -694,9 +756,11 @@ def schedule_request(request_id: str,
         The precondition is waited asynchronously and does not block the
         caller.
     """
-    request_task =
-
-
+    request_task = await prepare_request_async(request_id, request_name,
                                                request_body, func,
                                                request_cluster_name,
                                                schedule_type,
                                                is_skypilot_system)
     schedule_prepared_request(request_task, ignore_return_value, precondition,
                               retryable)
 
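get_request_thread_executor() lazily creates one bounded executor per server process, using a double-checked lock so concurrent callers never build two executors. OnDemandThreadExecutor lives in the new sky/server/requests/threads.py, which is not part of this section; the stand-in below only shows the same initialization pattern with the stdlib ThreadPoolExecutor, under that assumption:

import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

_LIMIT = 128  # cap mirrors _REQUEST_THREADS_LIMIT above
_LOCK = threading.Lock()
_EXECUTOR: Optional[ThreadPoolExecutor] = None


def get_executor() -> ThreadPoolExecutor:
    """Create the per-process executor on first use, then reuse it."""
    global _EXECUTOR
    if _EXECUTOR is not None:  # fast path once initialized, no locking
        return _EXECUTOR
    with _LOCK:  # slow path: at most one thread performs the construction
        if _EXECUTOR is None:
            _EXECUTOR = ThreadPoolExecutor(
                max_workers=_LIMIT,
                thread_name_prefix='request_thread_executor')
    return _EXECUTOR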
sky/server/requests/payloads.py
CHANGED
@@ -319,6 +319,8 @@ class StatusBody(RequestBody):
     # Only return fields that are needed for the
     # dashboard / CLI summary response
     summary_response: bool = False
+    # Include the cluster handle in the response
+    include_handle: bool = True
 
 
 class StartBody(RequestBody):
@@ -363,9 +365,10 @@ class CancelBody(RequestBody):
         return kwargs
 
 
-class
+class ProvisionLogsBody(RequestBody):
     """Cluster node."""
     cluster_name: str
+    worker: Optional[int] = None
 
 
 class ClusterJobBody(RequestBody):
@@ -541,6 +544,9 @@ class JobsQueueV2Body(RequestBody):
     page: Optional[int] = None
     limit: Optional[int] = None
     statuses: Optional[List[str]] = None
+    # The fields to return in the response.
+    # Refer to the fields in the `class ManagedJobRecord` in `response.py`
+    fields: Optional[List[str]] = None
 
 
 class JobsCancelBody(RequestBody):
@@ -573,6 +579,8 @@ class RequestStatusBody(pydantic.BaseModel):
     """The request body for the API request status endpoint."""
     request_ids: Optional[List[str]] = None
     all_status: bool = False
+    limit: Optional[int] = None
+    fields: Optional[List[str]] = None
 
 
 class ServeUpBody(RequestBody):