skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/server.py
CHANGED
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
import argparse
|
|
4
4
|
import asyncio
|
|
5
5
|
import base64
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
7
|
import contextlib
|
|
7
8
|
import datetime
|
|
9
|
+
from enum import IntEnum
|
|
8
10
|
import hashlib
|
|
9
11
|
import json
|
|
10
12
|
import multiprocessing
|
|
@@ -14,8 +16,10 @@ import posixpath
|
|
|
14
16
|
import re
|
|
15
17
|
import resource
|
|
16
18
|
import shutil
|
|
19
|
+
import struct
|
|
17
20
|
import sys
|
|
18
21
|
import threading
|
|
22
|
+
import traceback
|
|
19
23
|
from typing import Dict, List, Literal, Optional, Set, Tuple
|
|
20
24
|
import uuid
|
|
21
25
|
import zipfile
|
|
@@ -23,6 +27,7 @@ import zipfile
|
|
|
23
27
|
import aiofiles
|
|
24
28
|
import anyio
|
|
25
29
|
import fastapi
|
|
30
|
+
from fastapi import responses as fastapi_responses
|
|
26
31
|
from fastapi.middleware import cors
|
|
27
32
|
import starlette.middleware.base
|
|
28
33
|
import uvloop
|
|
@@ -41,6 +46,7 @@ from sky.data import storage_utils
|
|
|
41
46
|
from sky.jobs import utils as managed_job_utils
|
|
42
47
|
from sky.jobs.server import server as jobs_rest
|
|
43
48
|
from sky.metrics import utils as metrics_utils
|
|
49
|
+
from sky.provision import metadata_utils
|
|
44
50
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
45
51
|
from sky.schemas.api import responses
|
|
46
52
|
from sky.serve.server import server as serve_rest
|
|
@@ -58,6 +64,7 @@ from sky.server.auth import oauth2_proxy
|
|
|
58
64
|
from sky.server.requests import executor
|
|
59
65
|
from sky.server.requests import payloads
|
|
60
66
|
from sky.server.requests import preconditions
|
|
67
|
+
from sky.server.requests import request_names
|
|
61
68
|
from sky.server.requests import requests as requests_lib
|
|
62
69
|
from sky.skylet import constants
|
|
63
70
|
from sky.ssh_node_pools import server as ssh_node_pools_rest
|
|
@@ -73,6 +80,7 @@ from sky.utils import dag_utils
|
|
|
73
80
|
from sky.utils import perf_utils
|
|
74
81
|
from sky.utils import status_lib
|
|
75
82
|
from sky.utils import subprocess_utils
|
|
83
|
+
from sky.utils import ux_utils
|
|
76
84
|
from sky.utils.db import db_utils
|
|
77
85
|
from sky.volumes.server import server as volumes_rest
|
|
78
86
|
from sky.workspaces import server as workspaces_rest
|
|
@@ -159,7 +167,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
|
159
167
|
"""Middleware to add a request ID to each request."""
|
|
160
168
|
|
|
161
169
|
async def dispatch(self, request: fastapi.Request, call_next):
|
|
162
|
-
request_id =
|
|
170
|
+
request_id = requests_lib.get_new_request_id()
|
|
163
171
|
request.state.request_id = request_id
|
|
164
172
|
response = await call_next(request)
|
|
165
173
|
# TODO(syang): remove X-Request-ID when v0.10.0 is released.
|
|
@@ -451,11 +459,11 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
|
|
|
451
459
|
loop.call_at(target, tick)
|
|
452
460
|
|
|
453
461
|
|
|
454
|
-
def
|
|
462
|
+
async def schedule_on_boot_check_async():
|
|
455
463
|
try:
|
|
456
|
-
executor.
|
|
464
|
+
await executor.schedule_request_async(
|
|
457
465
|
request_id='skypilot-server-on-boot-check',
|
|
458
|
-
request_name=
|
|
466
|
+
request_name=request_names.RequestName.CHECK,
|
|
459
467
|
request_body=payloads.CheckBody(),
|
|
460
468
|
func=sky_check.check,
|
|
461
469
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -476,7 +484,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
476
484
|
if event.should_skip():
|
|
477
485
|
continue
|
|
478
486
|
try:
|
|
479
|
-
executor.
|
|
487
|
+
await executor.schedule_request_async(
|
|
480
488
|
request_id=event.id,
|
|
481
489
|
request_name=event.name,
|
|
482
490
|
request_body=payloads.RequestBody(),
|
|
@@ -491,7 +499,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
491
499
|
# Lifespan will be executed in each uvicorn worker process, we
|
|
492
500
|
# can safely ignore the error if the task is already scheduled.
|
|
493
501
|
logger.debug(f'Request {event.id} already exists.')
|
|
494
|
-
|
|
502
|
+
await schedule_on_boot_check_async()
|
|
495
503
|
asyncio.create_task(cleanup_upload_ids())
|
|
496
504
|
if metrics_utils.METRICS_ENABLED:
|
|
497
505
|
# Start monitoring the event loop lag in each server worker
|
|
@@ -663,6 +671,25 @@ except Exception: # pylint: disable=broad-except
|
|
|
663
671
|
pass # no issue, we will warn the user later if its too low
|
|
664
672
|
|
|
665
673
|
|
|
674
|
+
@app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
|
|
675
|
+
def handle_concurrent_worker_exhausted_error(
|
|
676
|
+
request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
|
|
677
|
+
del request # request is not used
|
|
678
|
+
# Print detailed error message to server log
|
|
679
|
+
logger.error('Concurrent worker exhausted: '
|
|
680
|
+
f'{common_utils.format_exception(e)}')
|
|
681
|
+
with ux_utils.enable_traceback():
|
|
682
|
+
logger.error(f' Traceback: {traceback.format_exc()}')
|
|
683
|
+
# Return human readable error message to client
|
|
684
|
+
return fastapi.responses.JSONResponse(
|
|
685
|
+
status_code=503,
|
|
686
|
+
content={
|
|
687
|
+
'detail':
|
|
688
|
+
('The server has exhausted its concurrent worker limit. '
|
|
689
|
+
'Please try again or scale the server if the load persists.')
|
|
690
|
+
})
|
|
691
|
+
|
|
692
|
+
|
|
666
693
|
@app.get('/token')
|
|
667
694
|
async def token(request: fastapi.Request,
|
|
668
695
|
local_port: Optional[int] = None) -> fastapi.responses.Response:
|
|
@@ -706,9 +733,9 @@ async def token(request: fastapi.Request,
|
|
|
706
733
|
async def check(request: fastapi.Request,
|
|
707
734
|
check_body: payloads.CheckBody) -> None:
|
|
708
735
|
"""Checks enabled clouds."""
|
|
709
|
-
executor.
|
|
736
|
+
await executor.schedule_request_async(
|
|
710
737
|
request_id=request.state.request_id,
|
|
711
|
-
request_name=
|
|
738
|
+
request_name=request_names.RequestName.CHECK,
|
|
712
739
|
request_body=check_body,
|
|
713
740
|
func=sky_check.check,
|
|
714
741
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -720,9 +747,9 @@ async def enabled_clouds(request: fastapi.Request,
|
|
|
720
747
|
workspace: Optional[str] = None,
|
|
721
748
|
expand: bool = False) -> None:
|
|
722
749
|
"""Gets enabled clouds on the server."""
|
|
723
|
-
executor.
|
|
750
|
+
await executor.schedule_request_async(
|
|
724
751
|
request_id=request.state.request_id,
|
|
725
|
-
request_name=
|
|
752
|
+
request_name=request_names.RequestName.ENABLED_CLOUDS,
|
|
726
753
|
request_body=payloads.EnabledCloudsBody(workspace=workspace,
|
|
727
754
|
expand=expand),
|
|
728
755
|
func=core.enabled_clouds,
|
|
@@ -736,9 +763,10 @@ async def realtime_kubernetes_gpu_availability(
|
|
|
736
763
|
realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
|
|
737
764
|
) -> None:
|
|
738
765
|
"""Gets real-time Kubernetes GPU availability."""
|
|
739
|
-
executor.
|
|
766
|
+
await executor.schedule_request_async(
|
|
740
767
|
request_id=request.state.request_id,
|
|
741
|
-
request_name=
|
|
768
|
+
request_name=request_names.RequestName.
|
|
769
|
+
REALTIME_KUBERNETES_GPU_AVAILABILITY,
|
|
742
770
|
request_body=realtime_gpu_availability_body,
|
|
743
771
|
func=core.realtime_kubernetes_gpu_availability,
|
|
744
772
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -751,9 +779,9 @@ async def kubernetes_node_info(
|
|
|
751
779
|
kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
|
|
752
780
|
) -> None:
|
|
753
781
|
"""Gets Kubernetes nodes information and hints."""
|
|
754
|
-
executor.
|
|
782
|
+
await executor.schedule_request_async(
|
|
755
783
|
request_id=request.state.request_id,
|
|
756
|
-
request_name=
|
|
784
|
+
request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
|
|
757
785
|
request_body=kubernetes_node_info_body,
|
|
758
786
|
func=kubernetes_utils.get_kubernetes_node_info,
|
|
759
787
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -763,9 +791,9 @@ async def kubernetes_node_info(
|
|
|
763
791
|
@app.get('/status_kubernetes')
|
|
764
792
|
async def status_kubernetes(request: fastapi.Request) -> None:
|
|
765
793
|
"""Gets Kubernetes status."""
|
|
766
|
-
executor.
|
|
794
|
+
await executor.schedule_request_async(
|
|
767
795
|
request_id=request.state.request_id,
|
|
768
|
-
request_name=
|
|
796
|
+
request_name=request_names.RequestName.STATUS_KUBERNETES,
|
|
769
797
|
request_body=payloads.RequestBody(),
|
|
770
798
|
func=core.status_kubernetes,
|
|
771
799
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -777,9 +805,9 @@ async def list_accelerators(
|
|
|
777
805
|
request: fastapi.Request,
|
|
778
806
|
list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
|
|
779
807
|
"""Gets list of accelerators from cloud catalog."""
|
|
780
|
-
executor.
|
|
808
|
+
await executor.schedule_request_async(
|
|
781
809
|
request_id=request.state.request_id,
|
|
782
|
-
request_name=
|
|
810
|
+
request_name=request_names.RequestName.LIST_ACCELERATORS,
|
|
783
811
|
request_body=list_accelerator_counts_body,
|
|
784
812
|
func=catalog.list_accelerators,
|
|
785
813
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -792,9 +820,9 @@ async def list_accelerator_counts(
|
|
|
792
820
|
list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
|
|
793
821
|
) -> None:
|
|
794
822
|
"""Gets list of accelerator counts from cloud catalog."""
|
|
795
|
-
executor.
|
|
823
|
+
await executor.schedule_request_async(
|
|
796
824
|
request_id=request.state.request_id,
|
|
797
|
-
request_name=
|
|
825
|
+
request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
|
|
798
826
|
request_body=list_accelerator_counts_body,
|
|
799
827
|
func=catalog.list_accelerator_counts,
|
|
800
828
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -828,6 +856,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
828
856
|
# server thread.
|
|
829
857
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
830
858
|
dag,
|
|
859
|
+
request_name=request_names.AdminPolicyRequestName.VALIDATE,
|
|
831
860
|
request_options=validate_body.get_request_options()) as dag:
|
|
832
861
|
dag.resolve_and_validate_volumes()
|
|
833
862
|
# Skip validating workdir and file_mounts, as those need to be
|
|
@@ -849,9 +878,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
|
|
|
849
878
|
async def optimize(optimize_body: payloads.OptimizeBody,
|
|
850
879
|
request: fastapi.Request) -> None:
|
|
851
880
|
"""Optimizes the user's DAG."""
|
|
852
|
-
executor.
|
|
881
|
+
await executor.schedule_request_async(
|
|
853
882
|
request_id=request.state.request_id,
|
|
854
|
-
request_name=
|
|
883
|
+
request_name=request_names.RequestName.OPTIMIZE,
|
|
855
884
|
request_body=optimize_body,
|
|
856
885
|
ignore_return_value=True,
|
|
857
886
|
func=core.optimize,
|
|
@@ -1059,9 +1088,9 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1059
1088
|
"""Launches a cluster or task."""
|
|
1060
1089
|
request_id = request.state.request_id
|
|
1061
1090
|
logger.info(f'Launching request: {request_id}')
|
|
1062
|
-
executor.
|
|
1091
|
+
await executor.schedule_request_async(
|
|
1063
1092
|
request_id,
|
|
1064
|
-
request_name=
|
|
1093
|
+
request_name=request_names.RequestName.CLUSTER_LAUNCH,
|
|
1065
1094
|
request_body=launch_body,
|
|
1066
1095
|
func=execution.launch,
|
|
1067
1096
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1075,9 +1104,9 @@ async def launch(launch_body: payloads.LaunchBody,
|
|
|
1075
1104
|
async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
1076
1105
|
"""Executes a task on an existing cluster."""
|
|
1077
1106
|
cluster_name = exec_body.cluster_name
|
|
1078
|
-
executor.
|
|
1107
|
+
await executor.schedule_request_async(
|
|
1079
1108
|
request_id=request.state.request_id,
|
|
1080
|
-
request_name=
|
|
1109
|
+
request_name=request_names.RequestName.CLUSTER_EXEC,
|
|
1081
1110
|
request_body=exec_body,
|
|
1082
1111
|
func=execution.exec,
|
|
1083
1112
|
precondition=preconditions.ClusterStartCompletePrecondition(
|
|
@@ -1093,9 +1122,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
|
|
|
1093
1122
|
async def stop(request: fastapi.Request,
|
|
1094
1123
|
stop_body: payloads.StopOrDownBody) -> None:
|
|
1095
1124
|
"""Stops a cluster."""
|
|
1096
|
-
executor.
|
|
1125
|
+
await executor.schedule_request_async(
|
|
1097
1126
|
request_id=request.state.request_id,
|
|
1098
|
-
request_name=
|
|
1127
|
+
request_name=request_names.RequestName.CLUSTER_STOP,
|
|
1099
1128
|
request_body=stop_body,
|
|
1100
1129
|
func=core.stop,
|
|
1101
1130
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1113,9 +1142,9 @@ async def status(
|
|
|
1113
1142
|
raise fastapi.HTTPException(
|
|
1114
1143
|
status_code=503,
|
|
1115
1144
|
detail='Server is shutting down, please try again later.')
|
|
1116
|
-
executor.
|
|
1145
|
+
await executor.schedule_request_async(
|
|
1117
1146
|
request_id=request.state.request_id,
|
|
1118
|
-
request_name=
|
|
1147
|
+
request_name=request_names.RequestName.CLUSTER_STATUS,
|
|
1119
1148
|
request_body=status_body,
|
|
1120
1149
|
func=core.status,
|
|
1121
1150
|
schedule_type=(requests_lib.ScheduleType.LONG if
|
|
@@ -1128,9 +1157,9 @@ async def status(
|
|
|
1128
1157
|
async def endpoints(request: fastapi.Request,
|
|
1129
1158
|
endpoint_body: payloads.EndpointsBody) -> None:
|
|
1130
1159
|
"""Gets the endpoint for a given cluster and port number (endpoint)."""
|
|
1131
|
-
executor.
|
|
1160
|
+
await executor.schedule_request_async(
|
|
1132
1161
|
request_id=request.state.request_id,
|
|
1133
|
-
request_name=
|
|
1162
|
+
request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
|
|
1134
1163
|
request_body=endpoint_body,
|
|
1135
1164
|
func=core.endpoints,
|
|
1136
1165
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1142,9 +1171,9 @@ async def endpoints(request: fastapi.Request,
|
|
|
1142
1171
|
async def down(request: fastapi.Request,
|
|
1143
1172
|
down_body: payloads.StopOrDownBody) -> None:
|
|
1144
1173
|
"""Tears down a cluster."""
|
|
1145
|
-
executor.
|
|
1174
|
+
await executor.schedule_request_async(
|
|
1146
1175
|
request_id=request.state.request_id,
|
|
1147
|
-
request_name=
|
|
1176
|
+
request_name=request_names.RequestName.CLUSTER_DOWN,
|
|
1148
1177
|
request_body=down_body,
|
|
1149
1178
|
func=core.down,
|
|
1150
1179
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1156,9 +1185,9 @@ async def down(request: fastapi.Request,
|
|
|
1156
1185
|
async def start(request: fastapi.Request,
|
|
1157
1186
|
start_body: payloads.StartBody) -> None:
|
|
1158
1187
|
"""Restarts a cluster."""
|
|
1159
|
-
executor.
|
|
1188
|
+
await executor.schedule_request_async(
|
|
1160
1189
|
request_id=request.state.request_id,
|
|
1161
|
-
request_name=
|
|
1190
|
+
request_name=request_names.RequestName.CLUSTER_START,
|
|
1162
1191
|
request_body=start_body,
|
|
1163
1192
|
func=core.start,
|
|
1164
1193
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1170,9 +1199,9 @@ async def start(request: fastapi.Request,
|
|
|
1170
1199
|
async def autostop(request: fastapi.Request,
|
|
1171
1200
|
autostop_body: payloads.AutostopBody) -> None:
|
|
1172
1201
|
"""Schedules an autostop/autodown for a cluster."""
|
|
1173
|
-
executor.
|
|
1202
|
+
await executor.schedule_request_async(
|
|
1174
1203
|
request_id=request.state.request_id,
|
|
1175
|
-
request_name=
|
|
1204
|
+
request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
|
|
1176
1205
|
request_body=autostop_body,
|
|
1177
1206
|
func=core.autostop,
|
|
1178
1207
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1184,9 +1213,9 @@ async def autostop(request: fastapi.Request,
|
|
|
1184
1213
|
async def queue(request: fastapi.Request,
|
|
1185
1214
|
queue_body: payloads.QueueBody) -> None:
|
|
1186
1215
|
"""Gets the job queue of a cluster."""
|
|
1187
|
-
executor.
|
|
1216
|
+
await executor.schedule_request_async(
|
|
1188
1217
|
request_id=request.state.request_id,
|
|
1189
|
-
request_name=
|
|
1218
|
+
request_name=request_names.RequestName.CLUSTER_QUEUE,
|
|
1190
1219
|
request_body=queue_body,
|
|
1191
1220
|
func=core.queue,
|
|
1192
1221
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1198,9 +1227,9 @@ async def queue(request: fastapi.Request,
|
|
|
1198
1227
|
async def job_status(request: fastapi.Request,
|
|
1199
1228
|
job_status_body: payloads.JobStatusBody) -> None:
|
|
1200
1229
|
"""Gets the status of a job."""
|
|
1201
|
-
executor.
|
|
1230
|
+
await executor.schedule_request_async(
|
|
1202
1231
|
request_id=request.state.request_id,
|
|
1203
|
-
request_name=
|
|
1232
|
+
request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
|
|
1204
1233
|
request_body=job_status_body,
|
|
1205
1234
|
func=core.job_status,
|
|
1206
1235
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1212,9 +1241,9 @@ async def job_status(request: fastapi.Request,
|
|
|
1212
1241
|
async def cancel(request: fastapi.Request,
|
|
1213
1242
|
cancel_body: payloads.CancelBody) -> None:
|
|
1214
1243
|
"""Cancels jobs on a cluster."""
|
|
1215
|
-
executor.
|
|
1244
|
+
await executor.schedule_request_async(
|
|
1216
1245
|
request_id=request.state.request_id,
|
|
1217
|
-
request_name=
|
|
1246
|
+
request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
|
|
1218
1247
|
request_body=cancel_body,
|
|
1219
1248
|
func=core.cancel,
|
|
1220
1249
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1231,9 +1260,10 @@ async def logs(
|
|
|
1231
1260
|
# TODO(zhwu): This should wait for the request on the cluster, e.g., async
|
|
1232
1261
|
# launch, to finish, so that a user does not need to manually pull the
|
|
1233
1262
|
# request status.
|
|
1234
|
-
|
|
1263
|
+
executor.check_request_thread_executor_available()
|
|
1264
|
+
request_task = await executor.prepare_request_async(
|
|
1235
1265
|
request_id=request.state.request_id,
|
|
1236
|
-
request_name=
|
|
1266
|
+
request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
|
|
1237
1267
|
request_body=cluster_job_body,
|
|
1238
1268
|
func=core.tail_logs,
|
|
1239
1269
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1243,10 +1273,11 @@ async def logs(
|
|
|
1243
1273
|
background_tasks.add_task(task.cancel)
|
|
1244
1274
|
# TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
|
|
1245
1275
|
# the same approach as /stream.
|
|
1246
|
-
return stream_utils.
|
|
1276
|
+
return stream_utils.stream_response_for_long_request(
|
|
1247
1277
|
request_id=request.state.request_id,
|
|
1248
1278
|
logs_path=request_task.log_path,
|
|
1249
1279
|
background_tasks=background_tasks,
|
|
1280
|
+
kill_request_on_disconnect=False,
|
|
1250
1281
|
)
|
|
1251
1282
|
|
|
1252
1283
|
|
|
@@ -1261,9 +1292,9 @@ async def download_logs(
|
|
|
1261
1292
|
# We should reuse the original request body, so that the env vars, such as
|
|
1262
1293
|
# user hash, are kept the same.
|
|
1263
1294
|
cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
|
|
1264
|
-
executor.
|
|
1295
|
+
await executor.schedule_request_async(
|
|
1265
1296
|
request_id=request.state.request_id,
|
|
1266
|
-
request_name=
|
|
1297
|
+
request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
|
|
1267
1298
|
request_body=cluster_jobs_body,
|
|
1268
1299
|
func=core.download_logs,
|
|
1269
1300
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1340,38 +1371,65 @@ async def download(download_body: payloads.DownloadBody,
|
|
|
1340
1371
|
|
|
1341
1372
|
# TODO(aylei): run it asynchronously after global_user_state support async op
|
|
1342
1373
|
@app.post('/provision_logs')
|
|
1343
|
-
def provision_logs(
|
|
1374
|
+
def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
|
|
1344
1375
|
follow: bool = True,
|
|
1345
1376
|
tail: int = 0) -> fastapi.responses.StreamingResponse:
|
|
1346
1377
|
"""Streams the provision.log for the latest launch request of a cluster."""
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1378
|
+
log_path = None
|
|
1379
|
+
cluster_name = provision_logs_body.cluster_name
|
|
1380
|
+
worker = provision_logs_body.worker
|
|
1381
|
+
# stream head node logs
|
|
1382
|
+
if worker is None:
|
|
1383
|
+
# Prefer clusters table first, then cluster_history as fallback.
|
|
1384
|
+
log_path_str = global_user_state.get_cluster_provision_log_path(
|
|
1385
|
+
cluster_name)
|
|
1386
|
+
if not log_path_str:
|
|
1387
|
+
log_path_str = (
|
|
1388
|
+
global_user_state.get_cluster_history_provision_log_path(
|
|
1389
|
+
cluster_name))
|
|
1390
|
+
if not log_path_str:
|
|
1391
|
+
raise fastapi.HTTPException(
|
|
1392
|
+
status_code=404,
|
|
1393
|
+
detail=('Provision log path is not recorded for this cluster. '
|
|
1394
|
+
'Please relaunch to generate provisioning logs.'))
|
|
1395
|
+
log_path = pathlib.Path(log_path_str).expanduser().resolve()
|
|
1396
|
+
if not log_path.exists():
|
|
1397
|
+
raise fastapi.HTTPException(
|
|
1398
|
+
status_code=404,
|
|
1399
|
+
detail=f'Provision log path does not exist: {str(log_path)}')
|
|
1358
1400
|
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1401
|
+
# stream worker node logs
|
|
1402
|
+
else:
|
|
1403
|
+
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
|
|
1404
|
+
if handle is None:
|
|
1405
|
+
raise fastapi.HTTPException(
|
|
1406
|
+
status_code=404,
|
|
1407
|
+
detail=('Cluster handle is not recorded for this cluster. '
|
|
1408
|
+
'Please relaunch to generate provisioning logs.'))
|
|
1409
|
+
# instance_ids includes head node
|
|
1410
|
+
instance_ids = handle.instance_ids
|
|
1411
|
+
if instance_ids is None:
|
|
1412
|
+
raise fastapi.HTTPException(
|
|
1413
|
+
status_code=400,
|
|
1414
|
+
detail='Instance IDs are not recorded for this cluster. '
|
|
1415
|
+
'Please relaunch to generate provisioning logs.')
|
|
1416
|
+
if worker > len(instance_ids) - 1:
|
|
1417
|
+
raise fastapi.HTTPException(
|
|
1418
|
+
status_code=400,
|
|
1419
|
+
detail=f'Worker {worker} is out of range. '
|
|
1420
|
+
f'The cluster has {len(instance_ids)} nodes.')
|
|
1421
|
+
log_path = metadata_utils.get_instance_log_dir(
|
|
1422
|
+
handle.get_cluster_name_on_cloud(), instance_ids[worker])
|
|
1364
1423
|
|
|
1365
1424
|
# Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
|
|
1366
1425
|
effective_tail = None if tail is None or tail <= 0 else tail
|
|
1367
1426
|
|
|
1368
1427
|
return fastapi.responses.StreamingResponse(
|
|
1369
|
-
content=stream_utils.log_streamer(
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
cluster_name=cluster_body.cluster_name),
|
|
1428
|
+
content=stream_utils.log_streamer(None,
|
|
1429
|
+
log_path,
|
|
1430
|
+
tail=effective_tail,
|
|
1431
|
+
follow=follow,
|
|
1432
|
+
cluster_name=cluster_name),
|
|
1375
1433
|
media_type='text/plain',
|
|
1376
1434
|
headers={
|
|
1377
1435
|
'Cache-Control': 'no-cache, no-transform',
|
|
@@ -1385,9 +1443,9 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
|
|
|
1385
1443
|
async def cost_report(request: fastapi.Request,
|
|
1386
1444
|
cost_report_body: payloads.CostReportBody) -> None:
|
|
1387
1445
|
"""Gets the cost report of a cluster."""
|
|
1388
|
-
executor.
|
|
1446
|
+
await executor.schedule_request_async(
|
|
1389
1447
|
request_id=request.state.request_id,
|
|
1390
|
-
request_name=
|
|
1448
|
+
request_name=request_names.RequestName.CLUSTER_COST_REPORT,
|
|
1391
1449
|
request_body=cost_report_body,
|
|
1392
1450
|
func=core.cost_report,
|
|
1393
1451
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1397,9 +1455,9 @@ async def cost_report(request: fastapi.Request,
|
|
|
1397
1455
|
@app.get('/storage/ls')
|
|
1398
1456
|
async def storage_ls(request: fastapi.Request) -> None:
|
|
1399
1457
|
"""Gets the storages."""
|
|
1400
|
-
executor.
|
|
1458
|
+
await executor.schedule_request_async(
|
|
1401
1459
|
request_id=request.state.request_id,
|
|
1402
|
-
request_name=
|
|
1460
|
+
request_name=request_names.RequestName.STORAGE_LS,
|
|
1403
1461
|
request_body=payloads.RequestBody(),
|
|
1404
1462
|
func=core.storage_ls,
|
|
1405
1463
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1410,9 +1468,9 @@ async def storage_ls(request: fastapi.Request) -> None:
|
|
|
1410
1468
|
async def storage_delete(request: fastapi.Request,
|
|
1411
1469
|
storage_body: payloads.StorageBody) -> None:
|
|
1412
1470
|
"""Deletes a storage."""
|
|
1413
|
-
executor.
|
|
1471
|
+
await executor.schedule_request_async(
|
|
1414
1472
|
request_id=request.state.request_id,
|
|
1415
|
-
request_name=
|
|
1473
|
+
request_name=request_names.RequestName.STORAGE_DELETE,
|
|
1416
1474
|
request_body=storage_body,
|
|
1417
1475
|
func=core.storage_delete,
|
|
1418
1476
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1423,9 +1481,9 @@ async def storage_delete(request: fastapi.Request,
|
|
|
1423
1481
|
async def local_up(request: fastapi.Request,
|
|
1424
1482
|
local_up_body: payloads.LocalUpBody) -> None:
|
|
1425
1483
|
"""Launches a Kubernetes cluster on API server."""
|
|
1426
|
-
executor.
|
|
1484
|
+
await executor.schedule_request_async(
|
|
1427
1485
|
request_id=request.state.request_id,
|
|
1428
|
-
request_name=
|
|
1486
|
+
request_name=request_names.RequestName.LOCAL_UP,
|
|
1429
1487
|
request_body=local_up_body,
|
|
1430
1488
|
func=core.local_up,
|
|
1431
1489
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
@@ -1436,19 +1494,36 @@ async def local_up(request: fastapi.Request,
|
|
|
1436
1494
|
async def local_down(request: fastapi.Request,
|
|
1437
1495
|
local_down_body: payloads.LocalDownBody) -> None:
|
|
1438
1496
|
"""Tears down the Kubernetes cluster started by local_up."""
|
|
1439
|
-
executor.
|
|
1497
|
+
await executor.schedule_request_async(
|
|
1440
1498
|
request_id=request.state.request_id,
|
|
1441
|
-
request_name=
|
|
1499
|
+
request_name=request_names.RequestName.LOCAL_DOWN,
|
|
1442
1500
|
request_body=local_down_body,
|
|
1443
1501
|
func=core.local_down,
|
|
1444
1502
|
schedule_type=requests_lib.ScheduleType.LONG,
|
|
1445
1503
|
)
|
|
1446
1504
|
|
|
1447
1505
|
|
|
1506
|
+
async def get_expanded_request_id(request_id: str) -> str:
|
|
1507
|
+
"""Gets the expanded request ID for a given request ID prefix."""
|
|
1508
|
+
request_tasks = await requests_lib.get_requests_async_with_prefix(
|
|
1509
|
+
request_id, fields=['request_id'])
|
|
1510
|
+
if request_tasks is None:
|
|
1511
|
+
raise fastapi.HTTPException(status_code=404,
|
|
1512
|
+
detail=f'Request {request_id!r} not found')
|
|
1513
|
+
if len(request_tasks) > 1:
|
|
1514
|
+
raise fastapi.HTTPException(status_code=400,
|
|
1515
|
+
detail=('Multiple requests found for '
|
|
1516
|
+
f'request ID prefix: {request_id}'))
|
|
1517
|
+
return request_tasks[0].request_id
|
|
1518
|
+
|
|
1519
|
+
|
|
1448
1520
|
# === API server related APIs ===
|
|
1449
|
-
@app.get('/api/get')
|
|
1521
|
+
@app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
|
|
1450
1522
|
async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
1451
1523
|
"""Gets a request with a given request ID prefix."""
|
|
1524
|
+
# Validate request_id prefix matches a single request.
|
|
1525
|
+
request_id = await get_expanded_request_id(request_id)
|
|
1526
|
+
|
|
1452
1527
|
while True:
|
|
1453
1528
|
req_status = await requests_lib.get_request_status_async(request_id)
|
|
1454
1529
|
if req_status is None:
|
|
@@ -1465,6 +1540,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
|
|
|
1465
1540
|
# to avoid storming the DB and CPU in the meantime
|
|
1466
1541
|
await asyncio.sleep(0.1)
|
|
1467
1542
|
request_task = await requests_lib.get_request_async(request_id)
|
|
1543
|
+
# TODO(aylei): refine this, /api/get will not be retried and this is
|
|
1544
|
+
# meaningless to retry. It is the original request that should be retried.
|
|
1468
1545
|
if request_task.should_retry:
|
|
1469
1546
|
raise fastapi.HTTPException(
|
|
1470
1547
|
status_code=503, detail=f'Request {request_id!r} should be retried')
|
|
@@ -1506,13 +1583,18 @@ async def stream(
|
|
|
1506
1583
|
clients, console for CLI/API clients), 'plain' (force plain text),
|
|
1507
1584
|
'html' (force HTML), or 'console' (force console)
|
|
1508
1585
|
"""
|
|
1586
|
+
# We need to save the user-supplied request ID for the response header.
|
|
1587
|
+
user_supplied_request_id = request_id
|
|
1509
1588
|
if request_id is not None and log_path is not None:
|
|
1510
1589
|
raise fastapi.HTTPException(
|
|
1511
1590
|
status_code=400,
|
|
1512
1591
|
detail='Only one of request_id and log_path can be provided')
|
|
1513
1592
|
|
|
1593
|
+
if request_id is not None:
|
|
1594
|
+
request_id = await get_expanded_request_id(request_id)
|
|
1595
|
+
|
|
1514
1596
|
if request_id is None and log_path is None:
|
|
1515
|
-
request_id = requests_lib.
|
|
1597
|
+
request_id = await requests_lib.get_latest_request_id_async()
|
|
1516
1598
|
if request_id is None:
|
|
1517
1599
|
raise fastapi.HTTPException(status_code=404,
|
|
1518
1600
|
detail='No request found')
|
|
@@ -1539,13 +1621,17 @@ async def stream(
|
|
|
1539
1621
|
'X-Accel-Buffering': 'no'
|
|
1540
1622
|
})
|
|
1541
1623
|
|
|
1624
|
+
polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
|
|
1542
1625
|
# Original plain text streaming logic
|
|
1543
1626
|
if request_id is not None:
|
|
1544
|
-
request_task = await requests_lib.get_request_async(
|
|
1627
|
+
request_task = await requests_lib.get_request_async(
|
|
1628
|
+
request_id, fields=['request_id', 'schedule_type'])
|
|
1545
1629
|
if request_task is None:
|
|
1546
1630
|
print(f'No task with request ID {request_id}')
|
|
1547
1631
|
raise fastapi.HTTPException(
|
|
1548
1632
|
status_code=404, detail=f'Request {request_id!r} not found')
|
|
1633
|
+
# req.log_path is derived from request_id,
|
|
1634
|
+
# so it's ok to just grab the request_id in the above query.
|
|
1549
1635
|
log_path_to_stream = request_task.log_path
|
|
1550
1636
|
if not log_path_to_stream.exists():
|
|
1551
1637
|
# The log file might be deleted by the request GC daemon but the
|
|
@@ -1553,6 +1639,9 @@ async def stream(
|
|
|
1553
1639
|
raise fastapi.HTTPException(
|
|
1554
1640
|
status_code=404,
|
|
1555
1641
|
detail=f'Log of request {request_id!r} has been deleted')
|
|
1642
|
+
if request_task.schedule_type == requests_lib.ScheduleType.LONG:
|
|
1643
|
+
polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
|
|
1644
|
+
del request_task
|
|
1556
1645
|
else:
|
|
1557
1646
|
assert log_path is not None, (request_id, log_path)
|
|
1558
1647
|
if log_path == constants.API_SERVER_LOGS:
|
|
@@ -1593,14 +1682,17 @@ async def stream(
|
|
|
1593
1682
|
'Transfer-Encoding': 'chunked'
|
|
1594
1683
|
}
|
|
1595
1684
|
if request_id is not None:
|
|
1596
|
-
headers[server_constants.STREAM_REQUEST_HEADER] =
|
|
1685
|
+
headers[server_constants.STREAM_REQUEST_HEADER] = (
|
|
1686
|
+
user_supplied_request_id
|
|
1687
|
+
if user_supplied_request_id else request_id)
|
|
1597
1688
|
|
|
1598
1689
|
return fastapi.responses.StreamingResponse(
|
|
1599
1690
|
content=stream_utils.log_streamer(request_id,
|
|
1600
1691
|
log_path_to_stream,
|
|
1601
1692
|
plain_logs=format == 'plain',
|
|
1602
1693
|
tail=tail,
|
|
1603
|
-
follow=follow
|
|
1694
|
+
follow=follow,
|
|
1695
|
+
polling_interval=polling_interval),
|
|
1604
1696
|
media_type='text/plain',
|
|
1605
1697
|
headers=headers,
|
|
1606
1698
|
)
|
|
@@ -1610,11 +1702,11 @@ async def stream(
|
|
|
1610
1702
|
async def api_cancel(request: fastapi.Request,
|
|
1611
1703
|
request_cancel_body: payloads.RequestCancelBody) -> None:
|
|
1612
1704
|
"""Cancels requests."""
|
|
1613
|
-
executor.
|
|
1705
|
+
await executor.schedule_request_async(
|
|
1614
1706
|
request_id=request.state.request_id,
|
|
1615
|
-
request_name=
|
|
1707
|
+
request_name=request_names.RequestName.API_CANCEL,
|
|
1616
1708
|
request_body=request_cancel_body,
|
|
1617
|
-
func=requests_lib.
|
|
1709
|
+
func=requests_lib.kill_requests_with_prefix,
|
|
1618
1710
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
1619
1711
|
)
|
|
1620
1712
|
|
|
@@ -1622,9 +1714,13 @@ async def api_cancel(request: fastapi.Request,
|
|
|
1622
1714
|
@app.get('/api/status')
|
|
1623
1715
|
async def api_status(
|
|
1624
1716
|
request_ids: Optional[List[str]] = fastapi.Query(
|
|
1625
|
-
None, description='Request
|
|
1717
|
+
None, description='Request ID prefixes to get status for.'),
|
|
1626
1718
|
all_status: bool = fastapi.Query(
|
|
1627
1719
|
False, description='Get finished requests as well.'),
|
|
1720
|
+
limit: Optional[int] = fastapi.Query(
|
|
1721
|
+
None, description='Number of requests to show.'),
|
|
1722
|
+
fields: Optional[List[str]] = fastapi.Query(
|
|
1723
|
+
None, description='Fields to get. If None, get all fields.'),
|
|
1628
1724
|
) -> List[payloads.RequestPayload]:
|
|
1629
1725
|
"""Gets the list of requests."""
|
|
1630
1726
|
if request_ids is None:
|
|
@@ -1635,15 +1731,22 @@ async def api_status(
|
|
|
1635
1731
|
requests_lib.RequestStatus.RUNNING,
|
|
1636
1732
|
]
|
|
1637
1733
|
request_tasks = await requests_lib.get_request_tasks_async(
|
|
1638
|
-
req_filter=requests_lib.RequestTaskFilter(
|
|
1639
|
-
|
|
1734
|
+
req_filter=requests_lib.RequestTaskFilter(
|
|
1735
|
+
status=statuses,
|
|
1736
|
+
limit=limit,
|
|
1737
|
+
fields=fields,
|
|
1738
|
+
sort=True,
|
|
1739
|
+
))
|
|
1740
|
+
return requests_lib.encode_requests(request_tasks)
|
|
1640
1741
|
else:
|
|
1641
1742
|
encoded_request_tasks = []
|
|
1642
1743
|
for request_id in request_ids:
|
|
1643
|
-
|
|
1644
|
-
|
|
1744
|
+
request_tasks = await requests_lib.get_requests_async_with_prefix(
|
|
1745
|
+
request_id)
|
|
1746
|
+
if request_tasks is None:
|
|
1645
1747
|
continue
|
|
1646
|
-
|
|
1748
|
+
for request_task in request_tasks:
|
|
1749
|
+
encoded_request_tasks.append(request_task.readable_encode())
|
|
1647
1750
|
return encoded_request_tasks
|
|
1648
1751
|
|
|
1649
1752
|
|
|
@@ -1703,23 +1806,44 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
|
|
|
1703
1806
|
version=sky.__version__,
|
|
1704
1807
|
version_on_disk=common.get_skypilot_version_on_disk(),
|
|
1705
1808
|
commit=sky.__commit__,
|
|
1809
|
+
# Whether basic auth on api server is enabled
|
|
1706
1810
|
basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
|
|
1707
1811
|
'false').lower() == 'true',
|
|
1708
1812
|
user=user if user is not None else None,
|
|
1813
|
+
# Whether service account token is enabled
|
|
1814
|
+
service_account_token_enabled=(os.environ.get(
|
|
1815
|
+
constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
|
|
1816
|
+
'false').lower() == 'true'),
|
|
1817
|
+
# Whether basic auth on ingress is enabled
|
|
1818
|
+
ingress_basic_auth_enabled=os.environ.get(
|
|
1819
|
+
constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
|
|
1820
|
+
'false').lower() == 'true',
|
|
1709
1821
|
)
|
|
1710
1822
|
|
|
1711
1823
|
|
|
1824
|
+
class KubernetesSSHMessageType(IntEnum):
|
|
1825
|
+
REGULAR_DATA = 0
|
|
1826
|
+
PINGPONG = 1
|
|
1827
|
+
LATENCY_MEASUREMENT = 2
|
|
1828
|
+
|
|
1829
|
+
|
|
1712
1830
|
@app.websocket('/kubernetes-pod-ssh-proxy')
|
|
1713
|
-
async def kubernetes_pod_ssh_proxy(
|
|
1714
|
-
|
|
1831
|
+
async def kubernetes_pod_ssh_proxy(
|
|
1832
|
+
websocket: fastapi.WebSocket,
|
|
1833
|
+
cluster_name: str,
|
|
1834
|
+
client_version: Optional[int] = None) -> None:
|
|
1715
1835
|
"""Proxies SSH to the Kubernetes pod with websocket."""
|
|
1716
1836
|
await websocket.accept()
|
|
1717
1837
|
logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
|
|
1718
1838
|
|
|
1839
|
+
timestamps_supported = client_version is not None and client_version > 21
|
|
1840
|
+
logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
|
|
1841
|
+
client_version = {client_version}')
|
|
1842
|
+
|
|
1719
1843
|
# Run core.status in another thread to avoid blocking the event loop.
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1844
|
+
with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
|
|
1845
|
+
cluster_records = await context_utils.to_thread_with_executor(
|
|
1846
|
+
thread_pool_executor, core.status, cluster_name, all_users=True)
|
|
1723
1847
|
cluster_record = cluster_records[0]
|
|
1724
1848
|
if cluster_record['status'] != status_lib.ClusterStatus.UP:
|
|
1725
1849
|
raise fastapi.HTTPException(
|
|
@@ -1770,6 +1894,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1770
1894
|
async def websocket_to_ssh():
|
|
1771
1895
|
try:
|
|
1772
1896
|
async for message in websocket.iter_bytes():
|
|
1897
|
+
if timestamps_supported:
|
|
1898
|
+
type_size = struct.calcsize('!B')
|
|
1899
|
+
message_type = struct.unpack('!B',
|
|
1900
|
+
message[:type_size])[0]
|
|
1901
|
+
if (message_type ==
|
|
1902
|
+
KubernetesSSHMessageType.REGULAR_DATA):
|
|
1903
|
+
# Regular data - strip type byte and forward to SSH
|
|
1904
|
+
message = message[type_size:]
|
|
1905
|
+
elif message_type == KubernetesSSHMessageType.PINGPONG:
|
|
1906
|
+
# PING message - respond with PONG (type 1)
|
|
1907
|
+
ping_id_size = struct.calcsize('!I')
|
|
1908
|
+
if len(message) != type_size + ping_id_size:
|
|
1909
|
+
raise ValueError('Invalid PING message '
|
|
1910
|
+
f'length: {len(message)}')
|
|
1911
|
+
# Return the same PING message, so that the client
|
|
1912
|
+
# can measure the latency.
|
|
1913
|
+
await websocket.send_bytes(message)
|
|
1914
|
+
continue
|
|
1915
|
+
elif (message_type ==
|
|
1916
|
+
KubernetesSSHMessageType.LATENCY_MEASUREMENT):
|
|
1917
|
+
# Latency measurement from client
|
|
1918
|
+
latency_size = struct.calcsize('!Q')
|
|
1919
|
+
if len(message) != type_size + latency_size:
|
|
1920
|
+
raise ValueError(
|
|
1921
|
+
'Invalid latency measurement '
|
|
1922
|
+
f'message length: {len(message)}')
|
|
1923
|
+
avg_latency_ms = struct.unpack(
|
|
1924
|
+
'!Q',
|
|
1925
|
+
message[type_size:type_size + latency_size])[0]
|
|
1926
|
+
latency_seconds = avg_latency_ms / 1000
|
|
1927
|
+
metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
|
|
1928
|
+
continue
|
|
1929
|
+
else:
|
|
1930
|
+
# Unknown message type.
|
|
1931
|
+
raise ValueError(
|
|
1932
|
+
f'Unknown message type: {message_type}')
|
|
1773
1933
|
writer.write(message)
|
|
1774
1934
|
try:
|
|
1775
1935
|
await writer.drain()
|
|
@@ -1800,6 +1960,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1800
1960
|
nonlocal ssh_failed
|
|
1801
1961
|
ssh_failed = True
|
|
1802
1962
|
break
|
|
1963
|
+
if timestamps_supported:
|
|
1964
|
+
# Prepend message type byte (0 = regular data)
|
|
1965
|
+
message_type_bytes = struct.pack(
|
|
1966
|
+
'!B', KubernetesSSHMessageType.REGULAR_DATA.value)
|
|
1967
|
+
data = message_type_bytes + data
|
|
1803
1968
|
await websocket.send_bytes(data)
|
|
1804
1969
|
except Exception: # pylint: disable=broad-except
|
|
1805
1970
|
pass
|
|
@@ -1837,9 +2002,9 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
|
|
|
1837
2002
|
async def all_contexts(request: fastapi.Request) -> None:
|
|
1838
2003
|
"""Gets all Kubernetes and SSH node pool contexts."""
|
|
1839
2004
|
|
|
1840
|
-
executor.
|
|
2005
|
+
await executor.schedule_request_async(
|
|
1841
2006
|
request_id=request.state.request_id,
|
|
1842
|
-
request_name=
|
|
2007
|
+
request_name=request_names.RequestName.ALL_CONTEXTS,
|
|
1843
2008
|
request_body=payloads.RequestBody(),
|
|
1844
2009
|
func=core.get_all_contexts,
|
|
1845
2010
|
schedule_type=requests_lib.ScheduleType.SHORT,
|
|
@@ -1967,6 +2132,19 @@ if __name__ == '__main__':
|
|
|
1967
2132
|
logger.error(f'Port {cmd_args.port} is not available, exiting.')
|
|
1968
2133
|
raise RuntimeError(f'Port {cmd_args.port} is not available')
|
|
1969
2134
|
|
|
2135
|
+
# Maybe touch the signal file on API server startup. Do it again here even
|
|
2136
|
+
# if we already touched it in the sky/server/common.py::_start_api_server.
|
|
2137
|
+
# This is because the sky/server/common.py::_start_api_server function call
|
|
2138
|
+
# is running outside the skypilot API server process tree. The process tree
|
|
2139
|
+
# starts within that function (see the `subprocess.Popen` call in
|
|
2140
|
+
# sky/server/common.py::_start_api_server). When pg is used, the
|
|
2141
|
+
# _start_api_server function will not load the config file from db, which
|
|
2142
|
+
# will ignore the consolidation mode config. Here, inside the process tree,
|
|
2143
|
+
# we already reload the config as a server (with env var _start_api_server),
|
|
2144
|
+
# so we will respect the consolidation mode config.
|
|
2145
|
+
# Refers to #7717 for more details.
|
|
2146
|
+
managed_job_utils.is_consolidation_mode(on_api_restart=True)
|
|
2147
|
+
|
|
1970
2148
|
# Show the privacy policy if it is not already shown. We place it here so
|
|
1971
2149
|
# that it is shown only when the API server is started.
|
|
1972
2150
|
usage_lib.maybe_show_privacy_policy()
|
|
@@ -2014,7 +2192,8 @@ if __name__ == '__main__':
|
|
|
2014
2192
|
uvicorn_config = uvicorn.Config('sky.server.server:app',
|
|
2015
2193
|
host=cmd_args.host,
|
|
2016
2194
|
port=cmd_args.port,
|
|
2017
|
-
workers=num_workers
|
|
2195
|
+
workers=num_workers,
|
|
2196
|
+
ws_per_message_deflate=False)
|
|
2018
2197
|
skyuvicorn.run(uvicorn_config,
|
|
2019
2198
|
max_db_connections=config.num_db_connections_per_worker)
|
|
2020
2199
|
except Exception as exc: # pylint: disable=broad-except
|